From 1cdd41bd03488b38c6082c766bf819b6bc94f61c Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Tue, 24 May 2022 09:46:12 +0000
Subject: [PATCH 01/40] fix pad_sequence, test=asr

---
 paddlespeech/s2t/utils/tensor_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py
index 0dbaa0b6b..e105253c2 100644
--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
@@ -82,7 +82,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     max_size = sequences[0].size()
     # (TODO Hui Zhang): slice not supprot `end==start`
     # trailing_dims = max_size[1:]
-    trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
+    trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
     max_len = max([s.shape[0] for s in sequences])
     if batch_first:
         out_dims = (len(sequences), max_len) + trailing_dims
@@ -99,7 +99,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
         if batch_first:
             # TODO (Hui Zhang): set_value op not supprot `end==start`
             # TODO (Hui Zhang): set_value op not support int16
-            # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] 
+            # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
             # out_tensor[i, :length, ...] = tensor
             if length != 0:
                 out_tensor[i, :length] = tensor
@@ -145,7 +145,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
                 [ 4,  5,  6, 11, -1, -1],
                 [ 7,  8,  9, 11, -1, -1]])
     """
-    # TODO(Hui Zhang): using comment code, 
+    # TODO(Hui Zhang): using comment code,
     #_sos = paddle.to_tensor(
     #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
     #_eos = paddle.to_tensor(

From e1888f9ae6d239b8c28f9739f7fd2a0120caac9e Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Tue, 24 May 2022 12:37:42 +0000
Subject: [PATCH 02/40] remove size,test=asr

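---
Review note (not part of the commit message; ignored by git am):
  In tensor_utils.py this patch changes `max_size = sequences[0].size()` to
  `sequences[0].shape`, but the following context line still evaluates
  `max_size[1:].numpy().tolist()`. `Tensor.shape` appears to be a plain Python
  list, so `pad_sequence` presumably fails for inputs with ndim >= 2 until
  PATCH 04 switches to `paddle.shape(...)`. Confirm before bisecting here.
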
 paddlespeech/s2t/__init__.py                  | 19 -------------
 .../s2t/decoders/beam_search/beam_search.py   | 10 +++----
 paddlespeech/s2t/decoders/scorers/ctc.py      |  4 +--
 .../s2t/decoders/scorers/ctc_prefix_score.py  | 27 +++++++++----------
 paddlespeech/s2t/models/u2/u2.py              |  2 +-
 paddlespeech/s2t/modules/decoder.py           |  2 +-
 paddlespeech/s2t/modules/embedding.py         |  4 +--
 paddlespeech/s2t/utils/tensor_utils.py        |  6 ++---
 8 files changed, 27 insertions(+), 47 deletions(-)

diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 2365071f3..7ec9e1aba 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -189,25 +189,6 @@ if not hasattr(paddle.Tensor, 'contiguous'):
     paddle.static.Variable.contiguous = contiguous
 
 
-def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
-    nargs = len(args)
-    assert (nargs <= 1)
-    s = paddle.shape(xs)
-    if nargs == 1:
-        return s[args[0]]
-    else:
-        return s
-
-
-#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-logger.debug(
-    "override size of paddle.Tensor "
-    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
-)
-paddle.Tensor.size = size
-paddle.static.Variable.size = size
-
-
 def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
     return xs.reshape(args)
 
diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py
index f331cb1c9..5029e1577 100644
--- a/paddlespeech/s2t/decoders/beam_search/beam_search.py
+++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py
@@ -194,7 +194,7 @@ class BeamSearch(paddle.nn.Layer):
 
         Args:
             hyp (Hypothesis): Hypothesis with prefix tokens to score
-            ids (paddle.Tensor): 1D tensor of new partial tokens to score, 
+            ids (paddle.Tensor): 1D tensor of new partial tokens to score,
                 len(ids) < n_vocab
             x (paddle.Tensor): Corresponding input feature, (T, D)
 
@@ -224,14 +224,14 @@ class BeamSearch(paddle.nn.Layer):
             ids (paddle.Tensor): The partial token ids(Global) to compute topk.
 
         Returns:
-            Tuple[paddle.Tensor, paddle.Tensor]: 
+            Tuple[paddle.Tensor, paddle.Tensor]:
                 The topk full token ids and partial token ids.
                 Their shapes are `(self.beam_size,)`.
                 i.e. (global ids, global relative local ids).
 
         """
         # no pre beam performed, `ids` equal to `weighted_scores`
-        if weighted_scores.size(0) == ids.size(0):
+        if weighted_scores.shape[0] == ids.shape[0]:
             top_ids = weighted_scores.topk(
                 self.beam_size)[1]  # index in n_vocab
             return top_ids, top_ids
@@ -374,8 +374,8 @@ class BeamSearch(paddle.nn.Layer):
         elif maxlenratio < 0:
             maxlen = -1 * int(maxlenratio)
         else:
-            maxlen = max(1, int(maxlenratio * x.size(0)))
-        minlen = int(minlenratio * x.size(0))
+            maxlen = max(1, int(maxlenratio * x.shape[0]))
+        minlen = int(minlenratio * x.shape[0])
         logger.info("decoder input length: " + str(x.shape[0]))
         logger.info("max output length: " + str(maxlen))
         logger.info("min output length: " + str(minlen))
diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py
index 81d8b0783..6f1d8c007 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc.py
@@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
                 return sc[i], st[i]
             else:  # for CTCPrefixScorePD (need new_id > 0)
                 r, log_psi, f_min, f_max, scoring_idmap = state
-                s = log_psi[i, new_id].expand(log_psi.size(1))
+                s = log_psi[i, new_id].expand(log_psi.shape[1])
                 if scoring_idmap is not None:
                     return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max
                 else:
@@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
 
         """
         logp = self.ctc.log_softmax(x.unsqueeze(0))  # assuming batch_size = 1
-        xlen = paddle.to_tensor([logp.size(1)])
+        xlen = paddle.to_tensor([logp.shape[1]])
         self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos)
         return None
 
diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
index 78b8fe36c..0e63a52a8 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
@@ -33,9 +33,9 @@ class CTCPrefixScorePD():
         self.logzero = -10000000000.0
         self.blank = blank
         self.eos = eos
-        self.batch = x.size(0)
-        self.input_length = x.size(1)
-        self.odim = x.size(2)
+        self.batch = x.shape[0]
+        self.input_length = x.shape[1]
+        self.odim = x.shape[2]
         self.dtype = x.dtype
 
         # Pad the rest of posteriors in the batch
@@ -76,8 +76,7 @@ class CTCPrefixScorePD():
         last_ids = [yi[-1] for yi in y]  # last output label ids
         n_bh = len(last_ids)  # batch * hyps
         n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
-        self.scoring_num = scoring_ids.size(
-            -1) if scoring_ids is not None else 0
+        self.scoring_num = scoring_ids.shape[-1] if scoring_ids is not None else 0
         # prepare state info
         if state is None:
             r_prev = paddle.full(
@@ -153,7 +152,7 @@ class CTCPrefixScorePD():
 
         # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
         for t in range(start, end):
-            rp = r[t - 1]  # (2 x BW x O') 
+            rp = r[t - 1]  # (2 x BW x O')
             rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
                 2, 2, n_bh, snum)  # (2,2,BW,O')
             r[t] = paddle.logsumexp(rr, 1) + x_[:, t]
@@ -227,7 +226,7 @@ class CTCPrefixScorePD():
         if self.x.shape[1] < x.shape[1]:  # self.x (2,T,B,O); x (B,T,O)
             # Pad the rest of posteriors in the batch
             # TODO(takaaki-hori): need a better way without for-loops
-            xlens = [x.size(1)]
+            xlens = [x.shape[1]]
             for i, l in enumerate(xlens):
                 if l < self.input_length:
                     x[i, l:, :] = self.logzero
@@ -237,7 +236,7 @@ class CTCPrefixScorePD():
             xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
             self.x = paddle.stack([xn, xb])  # (2, T, B, O)
             self.x[:, :tmp_x.shape[1], :, :] = tmp_x
-            self.input_length = x.size(1)
+            self.input_length = x.shape[1]
             self.end_frames = paddle.to_tensor(xlens) - 1
 
     def extend_state(self, state):
@@ -318,16 +317,16 @@ class CTCPrefixScore():
             r[0, 0] = xs[0]
             r[0, 1] = self.logzero
         else:
-            # Although the code does not exactly follow Algorithm 2, 
-            # we don't have to change it because we can assume 
-            # r_t(h)=0 for t < |h| in CTC forward computation 
+            # Although the code does not exactly follow Algorithm 2,
+            # we don't have to change it because we can assume
+            # r_t(h)=0 for t < |h| in CTC forward computation
             # (Note: we assume here that index t starts with 0).
             # The purpose of this difference is to reduce the number of for-loops.
             # https://github.com/espnet/espnet/pull/3655
-            # where we start to accumulate r_t(h) from t=|h| 
-            # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1, 
+            # where we start to accumulate r_t(h) from t=|h|
+            # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1,
             # avoiding accumulating zeros for t=1~|h|-1.
-            # Thus, we need to set r_{|h|-1}(h) = 0, 
+            # Thus, we need to set r_{|h|-1}(h) = 0,
             # i.e., r[output_length-1] = logzero, for initialization.
             # This is just for reducing the computation.
             r[output_length - 1] = self.logzero
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 530840d0f..e3f46b15a 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -775,7 +775,7 @@ class U2DecodeModel(U2BaseModel):
         """
         self.eval()
         x = paddle.to_tensor(x).unsqueeze(0)
-        ilen = x.size(1)
+        ilen = x.shape[1]
         enc_output, _ = self._forward_encoder(x, ilen)
         return enc_output.squeeze(0)
 
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index 42ac119b4..ce78059c0 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
             ]
 
         # batch decoding
-        ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0)  # (B,L,L)
+        ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0)  # (B,L,L)
         xs_mask = make_xs_mask(xs).unsqueeze(1)  # (B,1,T)
         logp, states = self.forward_one_step(
             xs, xs_mask, ys, ys_mask, cache=batch_state)
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index 596f61b78..cc1fdffe2 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
         assert offset + x.shape[
             1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
                 offset, x.shape[1], self.max_len)
-        #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
+        #TODO(Hui Zhang): using T = x.shape[1], __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + T]
         x = x * self.xscale + pos_emb
         return self.dropout(x), self.dropout(pos_emb)
@@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding):
             1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
                 offset, x.shape[1], self.max_len)
         x = x * self.xscale
-        #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
+        #TODO(Hui Zhang): using x.shape[1], __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + x.shape[1]]
         return self.dropout(x), self.dropout(pos_emb)
diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py
index e105253c2..ca8689569 100644
--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
@@ -58,8 +58,8 @@ def pad_sequence(sequences: List[paddle.Tensor],
         >>> a = paddle.ones(25, 300)
         >>> b = paddle.ones(22, 300)
         >>> c = paddle.ones(15, 300)
-        >>> pad_sequence([a, b, c]).size()
-        paddle.Tensor([25, 3, 300])
+        >>> pad_sequence([a, b, c]).shape
+        [25, 3, 300]
 
     Note:
         This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
@@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
 
     # assuming trailing dimensions and type of all the Tensors
     # in sequences are same and fetching those from sequences[0]
-    max_size = sequences[0].size()
+    max_size = sequences[0].shape
     # (TODO Hui Zhang): slice not supprot `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()

From 4c09927f61668952ee263cd178798b0ea5634760 Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Tue, 24 May 2022 13:34:01 +0000
Subject: [PATCH 03/40] fix remaining Tensor.size() calls, use .shape

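---
Review note (not part of the commit message; ignored by git am):
  The added line in TransformerLM._target_mask,
      m = subsequent_mask(ys_mask.shape[-1])).unsqueeze(0)
  has one closing parenthesis too many, so models/lm/transformer.py is a
  SyntaxError after this patch. Intended form:
      m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
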
 paddlespeech/s2t/__init__.py              | 2 +-
 paddlespeech/s2t/models/lm/transformer.py | 4 ++--
 paddlespeech/s2t/modules/encoder.py       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 7ec9e1aba..a2fce3057 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -200,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'):
 
 
 def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
-    return xs.reshape(ys.size())
+    return xs.reshape(ys.shape)
 
 
 if not hasattr(paddle.Tensor, 'view_as'):
diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py
index 85bd7c232..bb281168f 100644
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
 
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
-        m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(ys_mask.shape[-1])).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m
 
     def forward(self, x: paddle.Tensor, t: paddle.Tensor
@@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             in perplexity: p(t)^{-n} = exp(-log p(t) / n)
 
         """
-        batch_size = x.size(0)
+        batch_size = x.shape[0]
         xm = x != 0
         xlen = xm.sum(axis=1)
         if self.embed_drop is not None:
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 669a12d65..7298c61f2 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer):
         assert xs.shape[0] == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
         # TODO(Hui Zhang): stride_slice not support bool tensor
-        # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        # tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
         tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
         tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]
 

From b23bde8ec5ff4ed3990f151246dfbb8c9dccf385 Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Wed, 25 May 2022 03:30:48 +0000
Subject: [PATCH 04/40] tensor.shape => paddle.shape(tensor)

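---
Review note (not part of the commit message; ignored by git am):
  1. models/lm/transformer.py: the rewritten `_target_mask` line still carries
     the extra `)` from PATCH 03, so the file remains a SyntaxError:
         m = subsequent_mask(paddle.shape(ys_mask)[-1])).unsqueeze(0)
     Intended form:
         m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
  2. utils/tensor_utils.py: the `pad_sequence` docstring example still calls
     `.shape`, yet its expected output is changed back to
     `paddle.Tensor([25, 3, 300])`; `.shape` presumably prints `[25, 3, 300]`.
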
 paddlespeech/s2t/__init__.py                         |  2 +-
 paddlespeech/s2t/decoders/beam_search/beam_search.py | 10 +++++-----
 paddlespeech/s2t/decoders/scorers/ctc.py             |  4 ++--
 .../s2t/decoders/scorers/ctc_prefix_score.py         | 12 ++++++------
 paddlespeech/s2t/models/lm/transformer.py            |  6 +++---
 paddlespeech/s2t/models/u2/u2.py                     |  2 +-
 paddlespeech/s2t/modules/decoder.py                  |  2 +-
 paddlespeech/s2t/modules/embedding.py                |  4 ++--
 paddlespeech/s2t/modules/encoder.py                  |  2 +-
 paddlespeech/s2t/utils/tensor_utils.py               |  4 ++--
 10 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index a2fce3057..2da68435c 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -200,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'):
 
 
 def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
-    return xs.reshape(ys.shape)
+    return xs.reshape(paddle.shape(ys))
 
 
 if not hasattr(paddle.Tensor, 'view_as'):
diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py
index 5029e1577..f6a2b4b0a 100644
--- a/paddlespeech/s2t/decoders/beam_search/beam_search.py
+++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py
@@ -231,7 +231,7 @@ class BeamSearch(paddle.nn.Layer):
 
         """
         # no pre beam performed, `ids` equal to `weighted_scores`
-        if weighted_scores.shape[0] == ids.shape[0]:
+        if paddle.shape(weighted_scores)[0] == paddle.shape(ids)[0]:
             top_ids = weighted_scores.topk(
                 self.beam_size)[1]  # index in n_vocab
             return top_ids, top_ids
@@ -370,13 +370,13 @@ class BeamSearch(paddle.nn.Layer):
         """
         # set length bounds
         if maxlenratio == 0:
-            maxlen = x.shape[0]
+            maxlen = paddle.shape(x)[0]
         elif maxlenratio < 0:
             maxlen = -1 * int(maxlenratio)
         else:
-            maxlen = max(1, int(maxlenratio * x.shape[0]))
-        minlen = int(minlenratio * x.shape[0])
-        logger.info("decoder input length: " + str(x.shape[0]))
+            maxlen = max(1, int(maxlenratio * paddle.shape(x)[0]))
+        minlen = int(minlenratio * paddle.shape(x)[0])
+        logger.info("decoder input length: " + str(paddle.shape(x)[0]))
         logger.info("max output length: " + str(maxlen))
         logger.info("min output length: " + str(minlen))
 
diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py
index 6f1d8c007..3c1d4cf80 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc.py
@@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
                 return sc[i], st[i]
             else:  # for CTCPrefixScorePD (need new_id > 0)
                 r, log_psi, f_min, f_max, scoring_idmap = state
-                s = log_psi[i, new_id].expand(log_psi.shape[1])
+                s = log_psi[i, new_id].expand(paddle.shape(log_psi)[1])
                 if scoring_idmap is not None:
                     return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max
                 else:
@@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
 
         """
         logp = self.ctc.log_softmax(x.unsqueeze(0))  # assuming batch_size = 1
-        xlen = paddle.to_tensor([logp.shape[1]])
+        xlen = paddle.to_tensor([paddle.shape(logp)[1]])
         self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos)
         return None
 
diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
index 0e63a52a8..d8ca5ccde 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
@@ -33,9 +33,9 @@ class CTCPrefixScorePD():
         self.logzero = -10000000000.0
         self.blank = blank
         self.eos = eos
-        self.batch = x.shape[0]
-        self.input_length = x.shape[1]
-        self.odim = x.shape[2]
+        self.batch = paddle.shape(x)[0]
+        self.input_length = paddle.shape(x)[1]
+        self.odim = paddle.shape(x)[2]
         self.dtype = x.dtype
 
         # Pad the rest of posteriors in the batch
@@ -76,7 +76,7 @@ class CTCPrefixScorePD():
         last_ids = [yi[-1] for yi in y]  # last output label ids
         n_bh = len(last_ids)  # batch * hyps
         n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
-        self.scoring_num = scoring_ids.shape[-1] if scoring_ids is not None else 0
+        self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0
         # prepare state info
         if state is None:
             r_prev = paddle.full(
@@ -226,7 +226,7 @@ class CTCPrefixScorePD():
         if self.x.shape[1] < x.shape[1]:  # self.x (2,T,B,O); x (B,T,O)
             # Pad the rest of posteriors in the batch
             # TODO(takaaki-hori): need a better way without for-loops
-            xlens = [x.shape[1]]
+            xlens = [paddle.shape(x)[1]]
             for i, l in enumerate(xlens):
                 if l < self.input_length:
                     x[i, l:, :] = self.logzero
@@ -236,7 +236,7 @@ class CTCPrefixScorePD():
             xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
             self.x = paddle.stack([xn, xb])  # (2, T, B, O)
             self.x[:, :tmp_x.shape[1], :, :] = tmp_x
-            self.input_length = x.shape[1]
+            self.input_length = paddle.shape(x)[1]
             self.end_frames = paddle.to_tensor(xlens) - 1
 
     def extend_state(self, state):
diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py
index bb281168f..d14f99563 100644
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
 
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
-        m = subsequent_mask(ys_mask.shape[-1])).unsqueeze(0)
+        m = subsequent_mask(paddle.shape(ys_mask)[-1])).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m
 
     def forward(self, x: paddle.Tensor, t: paddle.Tensor
@@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             in perplexity: p(t)^{-n} = exp(-log p(t) / n)
 
         """
-        batch_size = x.shape[0]
+        batch_size = paddle.shape(x)[0]
         xm = x != 0
         xlen = xm.sum(axis=1)
         if self.embed_drop is not None:
@@ -122,7 +122,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         h, _ = self.encoder(emb, xlen)
         y = self.decoder(h)
         loss = F.cross_entropy(
-            y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+            y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none")
         mask = xm.to(loss.dtype)
         logp = loss * mask.view(-1)
         nll = logp.view(batch_size, -1).sum(-1)
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index e3f46b15a..d5471369f 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -775,7 +775,7 @@ class U2DecodeModel(U2BaseModel):
         """
         self.eval()
         x = paddle.to_tensor(x).unsqueeze(0)
-        ilen = x.shape[1]
+        ilen = paddle.shape(x)[1]
         enc_output, _ = self._forward_encoder(x, ilen)
         return enc_output.squeeze(0)
 
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index ce78059c0..ccc8482d5 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
             ]
 
         # batch decoding
-        ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0)  # (B,L,L)
+        ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0)  # (B,L,L)
         xs_mask = make_xs_mask(xs).unsqueeze(1)  # (B,1,T)
         logp, states = self.forward_one_step(
             xs, xs_mask, ys, ys_mask, cache=batch_state)
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index cc1fdffe2..51e558eb8 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
         assert offset + x.shape[
             1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
                 offset, x.shape[1], self.max_len)
-        #TODO(Hui Zhang): using T = x.shape[1], __getitem__ not support Tensor
+        #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + T]
         x = x * self.xscale + pos_emb
         return self.dropout(x), self.dropout(pos_emb)
@@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding):
             1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
                 offset, x.shape[1], self.max_len)
         x = x * self.xscale
-        #TODO(Hui Zhang): using x.shape[1], __getitem__ not support Tensor
+        #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + x.shape[1]]
         return self.dropout(x), self.dropout(pos_emb)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 7298c61f2..4d31acf1a 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer):
         assert xs.shape[0] == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
         # TODO(Hui Zhang): stride_slice not support bool tensor
-        # tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
+        # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool)
         tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
         tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]
 
diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py
index ca8689569..bc557b130 100644
--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
@@ -59,7 +59,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
         >>> b = paddle.ones(22, 300)
         >>> c = paddle.ones(15, 300)
         >>> pad_sequence([a, b, c]).shape
-        [25, 3, 300]
+        paddle.Tensor([25, 3, 300])
 
     Note:
         This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
@@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
 
     # assuming trailing dimensions and type of all the Tensors
     # in sequences are same and fetching those from sequences[0]
-    max_size = sequences[0].shape
+    max_size = paddle.shape(sequences[0])
     # (TODO Hui Zhang): slice not supprot `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()

From 6f7917b7f2b489b8341aeda2c8ff318975b84f78 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 25 May 2022 09:25:17 +0000
Subject: [PATCH 05/40] fix streaming asr

---
 .../conf/ws_conformer_application.yaml        |  2 +-
 ...plication.yaml => ws_ds2_application.yaml} |  0
 .../server/engine/asr/online/asr_engine.py    | 53 ++++---------------
 3 files changed, 12 insertions(+), 43 deletions(-)
 rename demos/streaming_asr_server/conf/{ws_application.yaml => ws_ds2_application.yaml} (100%)

diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml
index 2affde073..6a10741bd 100644
--- a/demos/streaming_asr_server/conf/ws_conformer_application.yaml
+++ b/demos/streaming_asr_server/conf/ws_conformer_application.yaml
@@ -4,7 +4,7 @@
 #                             SERVER SETTING                                    #
 #################################################################################
 host: 0.0.0.0
-port: 8090
+port: 8091
 
 # The task format in the engin_list is: <speech task>_<engine type>
 # task choices = ['asr_online']
diff --git a/demos/streaming_asr_server/conf/ws_application.yaml b/demos/streaming_asr_server/conf/ws_ds2_application.yaml
similarity index 100%
rename from demos/streaming_asr_server/conf/ws_application.yaml
rename to demos/streaming_asr_server/conf/ws_ds2_application.yaml
diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py
index 70bfcfb66..d7bd458f8 100644
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@@ -53,7 +53,7 @@ class PaddleASRConnectionHanddler:
         logger.info(
             "create an paddle asr connection handler to process the websocket connection"
         )
-        self.config = asr_engine.config
+        self.config = asr_engine.config # server config
         self.model_config = asr_engine.executor.config
         self.asr_engine = asr_engine
 
@@ -249,10 +249,13 @@ class PaddleASRConnectionHanddler:
     def reset(self):
         if "deepspeech2" in self.model_type:
             # for deepspeech2 
-            self.chunk_state_h_box = copy.deepcopy(
-                self.asr_engine.executor.chunk_state_h_box)
-            self.chunk_state_c_box = copy.deepcopy(
-                self.asr_engine.executor.chunk_state_c_box)
+            # init state
+            self.chunk_state_h_box = np.zeros(
+                (self.model_config .num_rnn_layers, 1, self.model_config.rnn_layer_size),
+                dtype=float32)
+            self.chunk_state_c_box = np.zeros(
+                (self.model_config.num_rnn_layers, 1, self.model_config.rnn_layer_size),
+                dtype=float32)
             self.decoder.reset_decoder(batch_size=1)
 
         self.device = None
@@ -803,36 +806,6 @@ class ASRServerExecutor(ASRExecutor):
                 model_file=self.am_model,
                 params_file=self.am_params,
                 predictor_conf=self.am_predictor_conf)
-
-            # decoder
-            logger.info("ASR engine start to create the ctc decoder instance")
-            self.decoder = CTCDecoder(
-                odim=self.config.output_dim,  # <blank> is in  vocab
-                enc_n_units=self.config.rnn_layer_size * 2,
-                blank_id=self.config.blank_id,
-                dropout_rate=0.0,
-                reduction=True,  # sum
-                batch_average=True,  # sum / batch_size
-                grad_norm_type=self.config.get('ctc_grad_norm_type', None))
-
-            # init decoder
-            logger.info("ASR engine start to init the ctc decoder")
-            cfg = self.config.decode
-            decode_batch_size = 1  # for online
-            self.decoder.init_decoder(
-                decode_batch_size, self.text_feature.vocab_list,
-                cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
-                cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
-                cfg.num_proc_bsearch)
-
-            # init state box
-            self.chunk_state_h_box = np.zeros(
-                (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
-                dtype=float32)
-            self.chunk_state_c_box = np.zeros(
-                (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
-                dtype=float32)
-
         elif "conformer" in model_type or "transformer" in model_type:
             model_name = model_type[:model_type.rindex(
                 '_')]  # model_type: {model_name}_{dataset}
@@ -847,15 +820,11 @@ class ASRServerExecutor(ASRExecutor):
             model_dict = paddle.load(self.am_model)
             self.model.set_state_dict(model_dict)
             logger.info("create the transformer like model success")
-
-            # update the ctc decoding
-            self.searcher = CTCPrefixBeamSearch(self.config.decode)
-            self.transformer_decode_reset()
         else:
             raise ValueError(f"Not support: {model_type}")
 
         return True
-
+        
 
 class ASREngine(BaseEngine):
     """ASR server resource
@@ -881,8 +850,8 @@ class ASREngine(BaseEngine):
         self.executor = ASRServerExecutor()
 
         try:
-            default_dev = paddle.get_device()
-            paddle.set_device(self.config.get("device", default_dev))
+            self.device = self.config.get("device", paddle.get_device())
+            paddle.set_device(self.device)
         except BaseException as e:
             logger.error(
                 f"Set device failed, please check if device '{self.device}' is already used and the parameter 'device' in the yaml file"

From f9f014d159e28efa788f4d241794420716d369ad Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Wed, 25 May 2022 10:39:28 +0000
Subject: [PATCH 06/40] add VITS readme, test=tts

---
 examples/aishell3/tts3/README.md              |  30 ++--
 examples/aishell3/voc1/README.md              |   2 +-
 examples/aishell3/voc5/README.md              |  21 +--
 examples/csmsc/tts0/README.md                 |  30 ++--
 examples/csmsc/tts2/README.md                 |  30 ++--
 examples/csmsc/tts3/README.md                 |  31 ++--
 examples/csmsc/tts3/README_cn.md              |  30 ++--
 examples/csmsc/vits/README.md                 | 146 ++++++++++++++++++
 examples/csmsc/voc1/README.md                 |   2 +-
 examples/csmsc/voc3/README.md                 |   2 +-
 examples/csmsc/voc4/README.md                 |   2 +-
 examples/csmsc/voc5/README.md                 |   2 +-
 examples/csmsc/voc6/README.md                 |   2 +-
 examples/ljspeech/tts0/README.md              |  30 ++--
 examples/ljspeech/tts1/README.md              |   2 +-
 examples/ljspeech/tts3/README.md              |  30 ++--
 examples/ljspeech/voc1/README.md              |   2 +-
 examples/ljspeech/voc5/README.md              |  21 +--
 examples/vctk/tts3/README.md                  |  30 ++--
 examples/vctk/voc1/README.md                  |   2 +-
 examples/vctk/voc5/README.md                  |  21 +--
 .../t2s/exps/gan_vocoder/hifigan/train.py     |   3 +-
 .../gan_vocoder/multi_band_melgan/train.py    |   2 +-
 .../gan_vocoder/parallelwave_gan/train.py     |   2 +-
 .../exps/gan_vocoder/style_melgan/train.py    |   3 +-
 .../t2s/exps/transformer_tts/train.py         |   2 +-
 paddlespeech/t2s/exps/vits/train.py           |   5 +-
 paddlespeech/t2s/exps/wavernn/train.py        |   3 +-
 28 files changed, 285 insertions(+), 203 deletions(-)

diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index d02ad1b63..93ce62c96 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -120,12 +120,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -134,11 +134,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -150,10 +149,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -169,12 +168,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -184,11 +183,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -199,10 +197,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -215,9 +213,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index eb30e7c40..503f8a19d 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -75,7 +75,7 @@ Train a ParallelWaveGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       ParallelWaveGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md
index c957c4a3a..f8f28f409 100644
--- a/examples/aishell3/voc5/README.md
+++ b/examples/aishell3/voc5/README.md
@@ -67,15 +67,13 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
-                [--run-benchmark RUN_BENCHMARK]
-                [--profiler_options PROFILER_OPTIONS]
+                [--ngpu NGPU]
 
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       HiFiGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
@@ -83,19 +81,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-
-benchmark:
-  arguments related to benchmark.
-
-  --batch-size BATCH_SIZE
-                        batch size.
-  --max-iter MAX_ITER   train max steps.
-  --run-benchmark RUN_BENCHMARK
-                        runing benchmark or not, if True, use the --batch-size
-                        and --max-iter.
-  --profiler_options PROFILER_OPTIONS
-                        The option of profiler, which should be in format
-                        "key1=value1;key2=value2;key3=value3".
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index 01376bd61..a337c7d45 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -133,10 +132,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -182,10 +180,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -198,9 +196,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 081d85848..553a370c9 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -139,10 +138,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -188,10 +186,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -204,9 +202,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index c734199b4..be18de7d6 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -141,10 +140,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -190,10 +188,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -204,11 +202,12 @@ optional arguments:
   --text TEXT           text to synthesize, a 'utt_id sentence' pair per line.
   --output_dir OUTPUT_DIR
                         output dir.
+
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
index 25931ecb1..a88615134 100644
--- a/examples/csmsc/tts3/README_cn.md
+++ b/examples/csmsc/tts3/README_cn.md
@@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -147,10 +146,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -197,10 +195,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -213,9 +211,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` 声学模型格式是否符合 {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数，对应于 fastspeech2 预训练模型中的 4 个文件。
+2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数，对应于 fastspeech2 预训练模型中的 4 个文件。
 3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数，对应于 parallel wavegan 预训练模型中的 3 个文件。
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数，对应于 parallel wavegan 预训练模型中的 3 个文件。
 5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。
 6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、
 7. `--text` 是文本文件，其中包含要合成的句子。
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index e69de29bb..0c16840a0 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -0,0 +1,146 @@
+# VITS with CSMSC
+This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here.
+You can download the alignment result from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from a text file.
+
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to run only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│   ├── norm
+│   └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│   ├── norm
+│   └── raw
+└── train
+    ├── feats_stats.npy
+    ├── norm
+    └── raw
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains wave and linear spectrogram of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance.
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+                [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+                [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a VITS model.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       config file to overwrite default config.
+  --train-metadata TRAIN_METADATA
+                        training data.
+  --dev-metadata DEV_METADATA
+                        dev data.
+  --output-dir OUTPUT_DIR
+                        output dir.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --phones-dict PHONES_DICT
+                        phone vocabulary file.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+
+### Synthesizing
+
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT]
+                     [--phones_dict PHONES_DICT] [--ngpu NGPU]
+                     [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+
+Synthesize with VITS
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       Config of VITS.
+  --ckpt CKPT           Checkpoint file of VITS.
+  --phones_dict PHONES_DICT
+                        phone vocabulary file.
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --test_metadata TEST_METADATA
+                        test metadata.
+  --output_dir OUTPUT_DIR
+                        output dir.
+```
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT]
+                         [--phones_dict PHONES_DICT] [--lang LANG]
+                         [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+                         [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with VITS
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config CONFIG       Config of VITS.
+  --ckpt CKPT           Checkpoint file of VITS.
+  --phones_dict PHONES_DICT
+                        phone vocabulary file.
+  --lang LANG           Choose model language. zh or en
+  --inference_dir INFERENCE_DIR
+                        dir to save inference models
+  --ngpu NGPU           if ngpu == 0, use cpu.
+  --text TEXT           text to synthesize, a 'utt_id sentence' pair per line.
+  --output_dir OUTPUT_DIR
+                        output dir.
+```
+1. `--config`, `--ckpt`, and `--phones_dict` are arguments for the VITS model, which correspond to the 3 files in the VITS pretrained model.
+2. `--lang` is the model language, which can be `zh` or `en`.
+3. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
+4. `--text` is the text file, which contains sentences to synthesize.
+5. `--output_dir` is the directory to save synthesized audio files.
+6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
+## Pretrained Model
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index 77da5b185..d19fe8497 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -65,7 +65,7 @@ Train a ParallelWaveGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       ParallelWaveGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index 12adaf7f4..eb7710362 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       Multi-Band MelGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index b7add3e57..d9e86a88d 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -63,7 +63,7 @@ Train a Style MelGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       Style MelGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index 94f93b48b..e044a0c74 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -63,7 +63,7 @@ Train a HiFiGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       HiFiGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md
index 7dcf133bd..f1a5ec3bb 100644
--- a/examples/csmsc/voc6/README.md
+++ b/examples/csmsc/voc6/README.md
@@ -63,7 +63,7 @@ Train a WaveRNN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       WaveRNN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index ba7ad6193..581f7930f 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -133,10 +132,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -182,10 +180,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -198,9 +196,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 7f32522ac..f85991cba 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -61,7 +61,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       TransformerTTS config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index e028fa05d..a6724083d 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ``text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -139,10 +138,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -188,10 +186,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -204,9 +202,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 4513b2a05..6fd6cbe24 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -65,7 +65,7 @@ Train a ParallelWaveGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       ParallelWaveGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md
index 9b31e2650..afc1bb8be 100644
--- a/examples/ljspeech/voc5/README.md
+++ b/examples/ljspeech/voc5/README.md
@@ -57,15 +57,13 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
-                [--run-benchmark RUN_BENCHMARK]
-                [--profiler_options PROFILER_OPTIONS]
+                [--ngpu NGPU]
 
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       HiFiGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
@@ -73,19 +71,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-
-benchmark:
-  arguments related to benchmark.
-
-  --batch-size BATCH_SIZE
-                        batch size.
-  --max-iter MAX_ITER   train max steps.
-  --run-benchmark RUN_BENCHMARK
-                        runing benchmark or not, if True, use the --batch-size
-                        and --max-iter.
-  --profiler_options PROFILER_OPTIONS
-                        The option of profiler, which should be in format
-                        "key1=value1;key2=value2;key3=value3".
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index f373ca6a3..379f5c0fd 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
 ```
 ```text
 usage: synthesize.py [-h]
-                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                     [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                      [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                      [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
                      [--voice-cloning VOICE_CLONING]
-                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                     [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
                      [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                      [--voc_stat VOC_STAT] [--ngpu NGPU]
                      [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -142,10 +141,10 @@ optional arguments:
                         speaker id map file.
   --voice-cloning VOICE_CLONING
                         whether training voice cloning model.
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
 ```
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
-                        Config of acoustic model. Use deault config when it is
-                        None.
+                        Config of acoustic model.
   --am_ckpt AM_CKPT     Checkpoint file of acoustic model.
   --am_stat AM_STAT     mean and standard deviation used to normalize
                         spectrogram when training acoustic model.
@@ -191,10 +189,10 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
-                        Config of voc. Use deault config when it is None.
+                        Config of voc.
   --voc_ckpt VOC_CKPT   Checkpoint file of voc.
   --voc_stat VOC_STAT   mean and standard deviation used to normalize
                         spectrogram when training voc.
@@ -207,9 +205,9 @@ optional arguments:
                         output dir.
 ```
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
 5. `--lang` is the model language, which can be `zh` or `en`.
 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test`  in the `dump` folder.
 7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 1c3016f88..c4c40d1d0 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -70,7 +70,7 @@ Train a ParallelWaveGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       ParallelWaveGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md
index 4eb25c02d..c53d46325 100644
--- a/examples/vctk/voc5/README.md
+++ b/examples/vctk/voc5/README.md
@@ -62,15 +62,13 @@ Here's the complete help message.
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
-                [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
-                [--run-benchmark RUN_BENCHMARK]
-                [--profiler_options PROFILER_OPTIONS]
+                [--ngpu NGPU]
 
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
 
 optional arguments:
   -h, --help            show this help message and exit
-  --config CONFIG       config file to overwrite default config.
+  --config CONFIG       HiFiGAN config file.
   --train-metadata TRAIN_METADATA
                         training data.
   --dev-metadata DEV_METADATA
@@ -78,19 +76,6 @@ optional arguments:
   --output-dir OUTPUT_DIR
                         output dir.
   --ngpu NGPU           if ngpu == 0, use cpu.
-
-benchmark:
-  arguments related to benchmark.
-
-  --batch-size BATCH_SIZE
-                        batch size.
-  --max-iter MAX_ITER   train max steps.
-  --run-benchmark RUN_BENCHMARK
-                        runing benchmark or not, if True, use the --batch-size
-                        and --max-iter.
-  --profiler_options PROFILER_OPTIONS
-                        The option of profiler, which should be in format
-                        "key1=value1;key2=value2;key3=value3".
 ```
 
 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
index c70821e78..4c733dc9b 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
@@ -243,8 +243,7 @@ def main():
     # parse args and config and redirect to train_sp
 
     parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
-    parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+    parser.add_argument("--config", type=str, help="HiFiGAN config file.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
index 27ffded63..3b3ebb478 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
@@ -233,7 +233,7 @@ def main():
     parser = argparse.ArgumentParser(
         description="Train a Multi-Band MelGAN model.")
     parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+        "--config", type=str, help="Multi-Band MelGAN config file.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
index 92de7a2c4..b26407028 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
@@ -208,7 +208,7 @@ def main():
     parser = argparse.ArgumentParser(
         description="Train a ParallelWaveGAN model.")
     parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+        "--config", type=str, help="ParallelWaveGAN config file.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
index be3ba7425..a87cc7a18 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
@@ -224,8 +224,7 @@ def main():
     # parse args and config and redirect to train_sp
 
     parser = argparse.ArgumentParser(description="Train a Style MelGAN model.")
-    parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+    parser.add_argument("--config", type=str, help="Style MelGAN config file.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index 45ecb269b..da48b6b99 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -160,7 +160,7 @@ def main():
     parser = argparse.ArgumentParser(description="Train a TransformerTTS "
                                      "model with LJSpeech TTS dataset.")
     parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+        "--config", type=str, help="TransformerTTS config file.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index b921f92af..dbda8b717 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -226,9 +226,8 @@ def train_sp(args, config):
 def main():
     # parse args and config and redirect to train_sp
 
-    parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
-    parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+    parser = argparse.ArgumentParser(description="Train a VITS model.")
+    parser.add_argument("--config", type=str, help="VITS config file")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py
index 8661d311d..cf24ea268 100644
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@@ -180,8 +180,7 @@ def main():
     # parse args and config and redirect to train_sp
 
     parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
-    parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
+    parser.add_argument("--config", type=str, help="WaveRNN config file.")
     parser.add_argument("--train-metadata", type=str, help="training data.")
     parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")

From 537aff9704c5c61e8f5bc334486599996279fa82 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Wed, 25 May 2022 20:54:11 +0800
Subject: [PATCH 07/40] refactor example dir & add aishell build TLG

---
 speechx/examples/dev/CMakeLists.txt           |   3 -
 speechx/examples/dev/glog/CMakeLists.txt      |   8 -
 speechx/examples/dev/glog/README.md           |  25 ---
 .../dev/glog/glog_logtostderr_test.cc         |  25 ---
 speechx/examples/dev/glog/glog_test.cc        |  23 ---
 speechx/examples/dev/glog/path.sh             |  15 --
 speechx/examples/dev/glog/run.sh              |  22 ---
 speechx/examples/ds2_ol/aishell/README.md     |  37 ++++
 .../aishell}/local/aishell_train_lms.sh       |   0
 .../aishell}/local/text_to_lexicon.py         |   0
 speechx/examples/ds2_ol/aishell/path.sh       |  12 +-
 .../examples/ds2_ol/aishell/run_build_tlg.sh  | 141 +++++++++++++
 speechx/examples/ds2_ol/aishell/run_fbank.sh  |   1 -
 speechx/examples/ngram/.gitignore             |   2 -
 speechx/examples/ngram/en/README.md           |   0
 speechx/examples/ngram/zh/README.md           | 101 ----------
 speechx/examples/ngram/zh/local/split_data.sh |  30 ---
 speechx/examples/ngram/zh/path.sh             |  12 --
 speechx/examples/ngram/zh/run.sh              |  68 -------
 speechx/examples/ngram/zh/utils               |   1 -
 speechx/examples/wfst/.gitignore              |   1 -
 speechx/examples/wfst/README.md               | 186 ------------------
 speechx/examples/wfst/path.sh                 |  19 --
 speechx/examples/wfst/run.sh                  |  29 ---
 speechx/examples/wfst/utils                   |   1 -
 25 files changed, 189 insertions(+), 573 deletions(-)
 delete mode 100644 speechx/examples/dev/CMakeLists.txt
 delete mode 100644 speechx/examples/dev/glog/CMakeLists.txt
 delete mode 100644 speechx/examples/dev/glog/README.md
 delete mode 100644 speechx/examples/dev/glog/glog_logtostderr_test.cc
 delete mode 100644 speechx/examples/dev/glog/glog_test.cc
 delete mode 100644 speechx/examples/dev/glog/path.sh
 delete mode 100755 speechx/examples/dev/glog/run.sh
 rename speechx/examples/{ngram/zh => ds2_ol/aishell}/local/aishell_train_lms.sh (100%)
 rename speechx/examples/{ngram/zh => ds2_ol/aishell}/local/text_to_lexicon.py (100%)
 create mode 100755 speechx/examples/ds2_ol/aishell/run_build_tlg.sh
 delete mode 100644 speechx/examples/ngram/.gitignore
 delete mode 100644 speechx/examples/ngram/en/README.md
 delete mode 100644 speechx/examples/ngram/zh/README.md
 delete mode 100755 speechx/examples/ngram/zh/local/split_data.sh
 delete mode 100644 speechx/examples/ngram/zh/path.sh
 delete mode 100755 speechx/examples/ngram/zh/run.sh
 delete mode 120000 speechx/examples/ngram/zh/utils
 delete mode 100644 speechx/examples/wfst/.gitignore
 delete mode 100644 speechx/examples/wfst/README.md
 delete mode 100644 speechx/examples/wfst/path.sh
 delete mode 100755 speechx/examples/wfst/run.sh
 delete mode 120000 speechx/examples/wfst/utils

diff --git a/speechx/examples/dev/CMakeLists.txt b/speechx/examples/dev/CMakeLists.txt
deleted file mode 100644
index c8445fb82..000000000
--- a/speechx/examples/dev/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_subdirectory(glog)
diff --git a/speechx/examples/dev/glog/CMakeLists.txt b/speechx/examples/dev/glog/CMakeLists.txt
deleted file mode 100644
index b4b0e6358..000000000
--- a/speechx/examples/dev/glog/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc)
-target_link_libraries(glog_test glog)
-
-
-add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc)
-target_link_libraries(glog_logtostderr_test glog)
\ No newline at end of file
diff --git a/speechx/examples/dev/glog/README.md b/speechx/examples/dev/glog/README.md
deleted file mode 100644
index 996e192e9..000000000
--- a/speechx/examples/dev/glog/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html)
-
-Unless otherwise specified, glog writes to the filename `/tmp/<program name>.<hostname>.<user name>.log.<severity level>.<date>.<time>.<pid>` (e.g., "/tmp/hello_world.example.com.hamaji.log.INFO.20080709-222411.10474"). By default, glog copies the log messages of severity level ERROR or FATAL to standard error (stderr) in addition to log files.
-
-Several flags influence glog's output behavior. If the Google gflags library is installed on your machine, the configure script (see the INSTALL file in the package for detail of this script) will automatically detect and use it, allowing you to pass flags on the command line. For example, if you want to turn the flag --logtostderr on, you can start your application with the following command line:
-
-   `./your_application --logtostderr=1`
-
-If the Google gflags library isn't installed, you set flags via environment variables, prefixing the flag name with "GLOG_", e.g.
-
-   `GLOG_logtostderr=1 ./your_application`
-
-You can also modify flag values in your program by modifying global variables `FLAGS_*` . Most settings start working immediately after you update `FLAGS_*` . The exceptions are the flags related to destination files. For example, you might want to set `FLAGS_log_dir` before calling `google::InitGoogleLogging` . Here is an example:
-∂∂
-```c++
-   LOG(INFO) << "file";
-   // Most flags work immediately after updating values.
-   FLAGS_logtostderr = 1;
-   LOG(INFO) << "stderr";
-   FLAGS_logtostderr = 0;
-   // This won't change the log destination. If you want to set this
-   // value, you should do this before google::InitGoogleLogging .
-   FLAGS_log_dir = "/some/log/directory";
-   LOG(INFO) << "the same file";
-```
diff --git a/speechx/examples/dev/glog/glog_logtostderr_test.cc b/speechx/examples/dev/glog/glog_logtostderr_test.cc
deleted file mode 100644
index b0616a7de..000000000
--- a/speechx/examples/dev/glog/glog_logtostderr_test.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <glog/logging.h>
-
-int main(int argc, char* argv[]) {
-    // Initialize Google’s logging library.
-    google::InitGoogleLogging(argv[0]);
-
-    FLAGS_logtostderr = 1;
-
-    LOG(INFO) << "Found " << 10 << " cookies";
-    LOG(ERROR) << "Found " << 10 << " error";
-}
\ No newline at end of file
diff --git a/speechx/examples/dev/glog/glog_test.cc b/speechx/examples/dev/glog/glog_test.cc
deleted file mode 100644
index b6275119e..000000000
--- a/speechx/examples/dev/glog/glog_test.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <glog/logging.h>
-
-int main(int argc, char* argv[]) {
-    // Initialize Google’s logging library.
-    google::InitGoogleLogging(argv[0]);
-
-    LOG(INFO) << "Found " << 10 << " cookies";
-    LOG(ERROR) << "Found " << 10 << " error";
-}
\ No newline at end of file
diff --git a/speechx/examples/dev/glog/path.sh b/speechx/examples/dev/glog/path.sh
deleted file mode 100644
index 1a96a861a..000000000
--- a/speechx/examples/dev/glog/path.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-SPEECHX_ROOT=$PWD/../../../
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-
-SPEECHX_BIN=$SPEECHX_EXAMPLES/dev/glog
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
-
-export LC_AL=C
diff --git a/speechx/examples/dev/glog/run.sh b/speechx/examples/dev/glog/run.sh
deleted file mode 100755
index d3fcdb643..000000000
--- a/speechx/examples/dev/glog/run.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. ./path.sh
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
-    pushd ${SPEECHX_ROOT} 
-    bash build.sh
-    popd
-fi
-
-# 2. run 
-glog_test
-
-echo "------"
-export FLAGS_logtostderr=1 
-glog_test
-
-echo "------"
-glog_logtostderr_test
diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md
index 1ed8a67c2..3e7af9244 100644
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -42,3 +42,40 @@ Overall -> 10.93 % N=104765 C=93410 S=9780 D=1575 I=95
 Mandarin -> 10.93 % N=104762 C=93410 S=9779 D=1573 I=95
 Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
 ```
+
+## fbank
+```
+bash run_fbank.sh
+```
+
+### CTC Prefix Beam Search w/o LM
+
+```
+Overall -> 10.44 % N=104765 C=94194 S=10174 D=397 I=369
+Mandarin -> 10.44 % N=104762 C=94194 S=10171 D=397 I=369
+Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
+
+### CTC Prefix Beam Search w/ LM
+
+LM: zh_giga.no_cna_cmn.prune01244.klm
+
+```
+Overall -> 5.82 % N=104765 C=99386 S=4944 D=435 I=720
+Mandarin -> 5.82 % N=104762 C=99386 S=4941 D=435 I=720
+English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+```
+
+### CTC WFST
+
+LM: [aishell train](https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip)
+```
+Overall -> 9.58 % N=104765 C=94817 S=4326 D=5622 I=84
+Mandarin -> 9.57 % N=104762 C=94817 S=4325 D=5620 I=84
+Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
+```
+
+## build TLG graph
+```
+bash run_build_tlg.sh
+```
diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ds2_ol/aishell/local/aishell_train_lms.sh
similarity index 100%
rename from speechx/examples/ngram/zh/local/aishell_train_lms.sh
rename to speechx/examples/ds2_ol/aishell/local/aishell_train_lms.sh
diff --git a/speechx/examples/ngram/zh/local/text_to_lexicon.py b/speechx/examples/ds2_ol/aishell/local/text_to_lexicon.py
similarity index 100%
rename from speechx/examples/ngram/zh/local/text_to_lexicon.py
rename to speechx/examples/ds2_ol/aishell/local/text_to_lexicon.py
diff --git a/speechx/examples/ds2_ol/aishell/path.sh b/speechx/examples/ds2_ol/aishell/path.sh
index 520129eaf..1807a277a 100755
--- a/speechx/examples/ds2_ol/aishell/path.sh
+++ b/speechx/examples/ds2_ol/aishell/path.sh
@@ -1,5 +1,6 @@
 # This contains the locations of binarys build required for running the examples.
 
+MAIN_ROOT=`realpath $PWD/../../../../`
 SPEECHX_ROOT=$PWD/../../..
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
 
@@ -10,5 +11,14 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
 export LC_AL=C
 
+# openfst bin & kaldi bin
+KALDI_DIR=$SPEECHX_ROOT/build/speechx/kaldi/
+OPENFST_DIR=$SPEECHX_ROOT/fc_patch/openfst-build/src
+
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+
 SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat:$SPEECHX_EXAMPLES/ds2_ol/websocket
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin
diff --git a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
new file mode 100755
index 000000000..68a31de4f
--- /dev/null
+++ b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+set -eo pipefail
+
+. path.sh
+
+# Attention: the vocab configured below only matches the model used by this script.
+# Different acoustic models have different vocabs, so replace it to match yours.
+ckpt_dir=data/fbank_model
+unit=$ckpt_dir/data/lang_char/vocab.txt       # vocab file, line: char/spm_pice
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+
+stage=-1
+stop_stage=100
+corpus=aishell
+lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
+text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+
+. utils/parse_options.sh
+
+data=$PWD/data
+mkdir -p $data
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+    if [ ! -f $data/speech.ngram.zh.tar.gz ];then
+        pushd $data
+        wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
+        tar xvzf speech.ngram.zh.tar.gz
+        popd
+    fi
+
+    if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+        mkdir -p $ckpt_dir
+        pushd $ckpt_dir
+        wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
+        tar xzfv WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
+        popd
+    fi
+fi
+
+if [ ! -f $unit ]; then
+    echo "$0: No such file $unit"
+    exit 1;
+fi
+
+if ! which ngram-count; then
+    pushd $MAIN_ROOT/tools
+    make srilm.done
+    popd
+fi
+
+mkdir -p data/local/dict
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # Prepare dict
+    # line: char/spm_pices
+    cp $unit data/local/dict/units.txt
+
+    if [ ! -f $lexicon ];then
+       local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+        echo "Generate $lexicon from $text"
+    fi
+
+    # filter by vocab
+    # line: word ph0 ... phn -> line: word char0 ... charn
+    utils/fst/prepare_dict.py \
+        --unit_file $unit \
+        --in_lexicon ${lexicon} \
+        --out_lexicon data/local/dict/lexicon.txt
+fi
+
+lm=data/local/lm
+mkdir -p $lm
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # Train lm
+    cp $text $lm/text
+    local/aishell_train_lms.sh
+    echo "build LM done."
+fi
+
+# build TLG
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  # build T & L
+  utils/fst/compile_lexicon_token_fst.sh \
+      data/local/dict data/local/tmp data/local/lang
+ 
+  # build G & TLG
+  utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
+
+fi
+
+aishell_wav_scp=aishell_test.scp
+nj=40
+cmvn=$data/cmvn_fbank.ark
+wfst=$data/lang_test
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+
+    if [ ! -d $data/test ]; then
+        pushd $data
+        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+        unzip  aishell_test.zip
+        popd
+
+        realpath $data/test/*/*.wav > $data/wavlist
+        awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+        paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+    fi
+
+    ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+    cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+fi
+
+wer=aishell_wer
+label_file=aishell_result
+exp=$PWD/exp  # result directory; it was never set, so the outputs below were aimed at "/"
+export GLOG_logtostderr=1
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    mkdir -p $exp  # TLG decoder: merged hypotheses and the WER report are written here
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
+    recognizer_test_main \
+        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+        --cmvn_file=$cmvn \
+        --model_path=$model_dir/avg_5.jit.pdmodel \
+        --streaming_chunk=30 \
+        --use_fbank=true \
+        --param_path=$model_dir/avg_5.jit.pdiparams \
+        --word_symbol_table=$wfst/words.txt \
+        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+        --model_cache_shapes="5-1-2048,5-1-2048" \
+        --graph_path=$wfst/TLG.fst --max_active=7500 \
+        --acoustic_scale=1.2 \
+        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_check_tlg
+    # merge the per-job results, then score them against $text at character level
+    cat $data/split${nj}/*/result_check_tlg > $exp/${label_file}_check_tlg
+    utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_check_tlg > $exp/${wer}.check_tlg
+    echo "recognizer test have finished!!!"
+    echo "please checkout in ${exp}/${wer}.check_tlg"
+fi
+
+exit 0
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 3d4825ace..483fbfdfe 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -154,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
         --model_path=$model_dir/avg_5.jit.pdmodel \
         --streaming_chunk=30 \
         --use_fbank=true \
-        --to_float32=false \
         --param_path=$model_dir/avg_5.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
diff --git a/speechx/examples/ngram/.gitignore b/speechx/examples/ngram/.gitignore
deleted file mode 100644
index bbd86a25b..000000000
--- a/speechx/examples/ngram/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-data
-exp
diff --git a/speechx/examples/ngram/en/README.md b/speechx/examples/ngram/en/README.md
deleted file mode 100644
index e69de29bb..000000000
diff --git a/speechx/examples/ngram/zh/README.md b/speechx/examples/ngram/zh/README.md
deleted file mode 100644
index e11bd3439..000000000
--- a/speechx/examples/ngram/zh/README.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# ngram train for mandarin
-
-Quick run:
-```
-bash run.sh --stage -1
-```
-
-## input
-
-input files:
-```
-data/
-├── lexicon.txt
-├── text
-└── vocab.txt
-```
-
-```
-==> data/text <==
-BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
-BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
-BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
-BAC009S0002W0125 各地 政府 便 纷纷 跟进
-BAC009S0002W0126 仅 一 个 多 月 的 时间 里
-BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
-BAC009S0002W0128 四十六 个 限 购 城市 当中
-BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
-BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
-BAC009S0002W0131 显示 出 了 极 强 的 威力
-
-==> data/lexicon.txt <==
-SIL sil
-<SPOKEN_NOISE> sil
-啊 aa a1
-啊 aa a2
-啊 aa a4
-啊 aa a5
-啊啊啊 aa a2 aa a2 aa a2
-啊啊啊 aa a5 aa a5 aa a5
-坐地 z uo4 d i4
-坐实 z uo4 sh ix2
-坐视 z uo4 sh ix4
-坐稳 z uo4 uu un3
-坐拥 z uo4 ii iong1
-坐诊 z uo4 zh en3
-坐庄 z uo4 zh uang1
-坐姿 z uo4 z iy1
-
-==> data/vocab.txt <==
-<blank>
-<unk>
-A
-B
-C
-D
-E
-龙
-龚
-龛
-<eos>
-```
-
-## output
-
-```
-data/
-├── local
-│   ├── dict
-│   │   ├── lexicon.txt
-│   │   └── units.txt
-│   └── lm
-│       ├── heldout
-│       ├── lm.arpa
-│       ├── text
-│       ├── text.no_oov
-│       ├── train
-│       ├── unigram.counts
-│       ├── word.counts
-│       └── wordlist
-```
-
-```
-/workspace/srilm/bin/i686-m64/ngram-count
-Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
-Ignoring words 矽, which contains oov unit
-Ignoring words 傩, which contains oov unit
-Ignoring words 堀, which contains oov unit
-Ignoring words 莼, which contains oov unit
-Ignoring words 菰, which contains oov unit
-Ignoring words 摭, which contains oov unit
-Ignoring words 帙, which contains oov unit
-Ignoring words 迨, which contains oov unit
-Ignoring words 孥, which contains oov unit
-Ignoring words 瑗, which contains oov unit
-...
-...
-...
-file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
-0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
-build LM done.
-```
diff --git a/speechx/examples/ngram/zh/local/split_data.sh b/speechx/examples/ngram/zh/local/split_data.sh
deleted file mode 100755
index 2af6fc5ab..000000000
--- a/speechx/examples/ngram/zh/local/split_data.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-set -eo pipefail
-
-data=$1
-scp=$2
-split_name=$3
-numsplit=$4
-
-# save in $data/split{n}
-# $scp to split
-# 
-
-if [[ ! $numsplit -gt 0 ]]; then
-  echo "Invalid num-split argument";
-  exit 1;
-fi
-
-directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
-scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)
-
-# if this mkdir fails due to argument-list being too long, iterate.
-if ! mkdir -p $directories >&/dev/null; then
-  for n in `seq $numsplit`; do
-    mkdir -p $data/split${numsplit}/$n
-  done
-fi
-
-echo "utils/split_scp.pl $scp $scp_splits"
-utils/split_scp.pl $scp $scp_splits
diff --git a/speechx/examples/ngram/zh/path.sh b/speechx/examples/ngram/zh/path.sh
deleted file mode 100644
index a3fb3d758..000000000
--- a/speechx/examples/ngram/zh/path.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-MAIN_ROOT=`realpath $PWD/../../../../`
-SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
-
-export LC_AL=C
-
-# srilm
-export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
-export SRILM=${MAIN_ROOT}/tools/srilm
-export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
deleted file mode 100755
index f24ad0a7c..000000000
--- a/speechx/examples/ngram/zh/run.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-. path.sh
-
-stage=-1
-stop_stage=100
-corpus=aishell
-
-unit=data/vocab.txt       # vocab file, line: char/spm_pice
-lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
-text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
-
-. utils/parse_options.sh
-
-data=$PWD/data
-mkdir -p $data
-
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-    if [ ! -f $data/speech.ngram.zh.tar.gz ];then
-        pushd $data
-        wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
-        tar xvzf speech.ngram.zh.tar.gz
-        popd
-    fi
-fi
-
-if [ ! -f $unit ]; then
-    echo "$0: No such file $unit"
-    exit 1;
-fi
-
-if ! which ngram-count; then
-    pushd $MAIN_ROOT/tools
-    make srilm.done
-    popd
-fi
-
-mkdir -p data/local/dict
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # 7.1 Prepare dict
-    # line: char/spm_pices
-    cp $unit data/local/dict/units.txt
-
-    if [ ! -f $lexicon ];then
-        local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
-        echo "Generate $lexicon from $text"
-    fi
-
-    # filter by vocab
-    # line: word ph0 ... phn -> line: word char0 ... charn
-    utils/fst/prepare_dict.py \
-        --unit_file $unit \
-        --in_lexicon ${lexicon} \
-        --out_lexicon data/local/dict/lexicon.txt
-fi
-
-lm=data/local/lm
-mkdir -p $lm
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # 7.2 Train lm
-    cp $text $lm/text
-    local/aishell_train_lms.sh
-fi
-
-echo "build LM done."
-exit 0
diff --git a/speechx/examples/ngram/zh/utils b/speechx/examples/ngram/zh/utils
deleted file mode 120000
index c2519a9dd..000000000
--- a/speechx/examples/ngram/zh/utils
+++ /dev/null
@@ -1 +0,0 @@
-../../../../utils/
\ No newline at end of file
diff --git a/speechx/examples/wfst/.gitignore b/speechx/examples/wfst/.gitignore
deleted file mode 100644
index 1269488f7..000000000
--- a/speechx/examples/wfst/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-data
diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md
deleted file mode 100644
index d0bdac0fc..000000000
--- a/speechx/examples/wfst/README.md
+++ /dev/null
@@ -1,186 +0,0 @@
-# Built TLG wfst
-
-## Input
-```
-data/local/
-├── dict
-│   ├── lexicon.txt
-│   └── units.txt
-└── lm
-    ├── heldout
-    ├── lm.arpa
-    ├── text
-    ├── text.no_oov
-    ├── train
-    ├── unigram.counts
-    ├── word.counts
-    └── wordlist
-```
-
-```
-==> data/local/dict/lexicon.txt <==
-啊 啊
-啊啊啊 啊 啊 啊
-阿 阿
-阿尔 阿 尔
-阿根廷 阿 根 廷
-阿九 阿 九
-阿克 阿 克
-阿拉伯数字 阿 拉 伯 数 字
-阿拉法特 阿 拉 法 特
-阿拉木图 阿 拉 木 图
-
-==> data/local/dict/units.txt <==
-<blank>
-<unk>
-A
-B
-C
-D
-E
-F
-G
-H
-
-==> data/local/lm/heldout <==
-而 对 楼市 成交 抑制 作用 最 大 的 限 购
-也 成为 地方 政府 的 眼中 钉
-自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
-各地 政府 便 纷纷 跟进
-仅 一 个 多 月 的 时间 里
-除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
-四十六 个 限 购 城市 当中
-四十一 个 已 正式 取消 或 变相 放松 了 限 购
-财政 金融 政策 紧随 其后 而来
-显示 出 了 极 强 的 威力
-
-==> data/local/lm/lm.arpa <==
-
-\data\
-ngram 1=129356
-ngram 2=504661
-ngram 3=123455
-
-\1-grams:
--1.531278       </s>
--3.828829       <SPOKEN_NOISE>  -0.1600094
--6.157292       <UNK>
-
-==> data/local/lm/text <==
-BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
-BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
-BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
-BAC009S0002W0125 各地 政府 便 纷纷 跟进
-BAC009S0002W0126 仅 一 个 多 月 的 时间 里
-BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
-BAC009S0002W0128 四十六 个 限 购 城市 当中
-BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
-BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
-BAC009S0002W0131 显示 出 了 极 强 的 威力
-
-==> data/local/lm/text.no_oov <==
-<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购 
-<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉 
-<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 
-<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进 
-<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里 
-<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 
-<SPOKEN_NOISE> 四十六 个 限 购 城市 当中 
-<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购 
-<SPOKEN_NOISE> 财政 ���融 政策 紧随 其后 而来 
-<SPOKEN_NOISE> 显示 出 了 极 强 的 威力 
-
-==> data/local/lm/train <==
-汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
-并 计划 朝云 计算 方面 发展
-汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
-媒体 就 曾 披露 这笔 交易
-虽然 双方 已经 正式 签署 了 外包 协议
-但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
-陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
-并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
-曾 在 多家 国际 公司 任职
-拥有 业务 开发 商务 及 企业 治理
-
-==> data/local/lm/unigram.counts <==
-  57487 的
-  13099 在
-  11862 一
-  11397 了
-  10998 不
-   9913 是
-   7952 有
-   6250 和
-   6152 个
-   5422 将
-
-==> data/local/lm/word.counts <==
-  57486 的
-  13098 在
-  11861 一
-  11396 了
-  10997 不
-   9912 是
-   7951 有
-   6249 和
-   6151 个
-   5421 将
-
-==> data/local/lm/wordlist <==
-的
-在
-一
-了
-不
-是
-有
-和
-个
-将
-```
-
-## Output
-
-```
-fstaddselfloops 'echo 4234 |' 'echo 123660 |' 
-Lexicon and Token FSTs compiling succeeded
-arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true - 
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
-LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
-Checking how stochastic G is (the first of these numbers should be small):
-fstisstochastic data/lang_test/G.fst 
-0 -1.14386
-fsttablecompose data/lang_test/L.fst data/lang_test/G.fst 
-fstminimizeencoded 
-fstdeterminizestar --use-log=true 
-fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst 
-Composing decoding graph TLG.fst succeeded
-Aishell build TLG done.
-```
-
-```
-data/
-├── lang_test
-│   ├── G.fst
-│   ├── L.fst
-│   ├── LG.fst
-│   ├── T.fst
-│   ├── TLG.fst
-│   ├── tokens.txt
-│   ├── units.txt
-│   └── words.txt
-└── local
-    ├── lang
-    │   ├── L.fst
-    │   ├── T.fst
-    │   ├── tokens.txt
-    │   ├── units.txt
-    │   └── words.txt
-    └── tmp
-        ├── disambig.list
-        ├── lexiconp_disambig.txt
-        ├── lexiconp.txt
-        └── units.list
-```
diff --git a/speechx/examples/wfst/path.sh b/speechx/examples/wfst/path.sh
deleted file mode 100644
index a07c1297d..000000000
--- a/speechx/examples/wfst/path.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-MAIN_ROOT=`realpath $PWD/../../../`
-SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
-
-export LC_AL=C
-
-# srilm
-export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
-export SRILM=${MAIN_ROOT}/tools/srilm
-export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
-
-# Kaldi
-export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
-[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
-export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
-[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
-[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
diff --git a/speechx/examples/wfst/run.sh b/speechx/examples/wfst/run.sh
deleted file mode 100755
index 1354646af..000000000
--- a/speechx/examples/wfst/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-. path.sh
-
-stage=-1
-stop_stage=100
-
-. utils/parse_options.sh
-
-if ! which fstprint ; then
-    pushd $MAIN_ROOT/tools
-    make kaldi.done
-    popd
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 
-    # build T & L
-    # utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
-    utils/fst/compile_lexicon_token_fst.sh \
-        data/local/dict data/local/tmp data/local/lang
-
-    # build G & LG & TLG
-    # utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
-    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
-fi
-
-echo "build TLG done."
-exit 0
diff --git a/speechx/examples/wfst/utils b/speechx/examples/wfst/utils
deleted file mode 120000
index 256f914ab..000000000
--- a/speechx/examples/wfst/utils
+++ /dev/null
@@ -1 +0,0 @@
-../../../utils/
\ No newline at end of file

From f852514a3ef31b32f583c17bc282e4e0db809719 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Wed, 25 May 2022 21:25:01 +0800
Subject: [PATCH 08/40] mv text_to_lexicon.py to utils

---
 .../examples/ds2_ol/aishell/run_build_tlg.sh  |  2 +-
 utils/text_to_lexicon.py                      | 37 +++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100755 utils/text_to_lexicon.py

diff --git a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
index 68a31de4f..4394ac5a0 100755
--- a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
+++ b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
@@ -55,7 +55,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     cp $unit data/local/dict/units.txt
 
     if [ ! -f $lexicon ];then
-       local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+       utils/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
         echo "Generate $lexicon from $text"
     fi
 
diff --git a/utils/text_to_lexicon.py b/utils/text_to_lexicon.py
new file mode 100755
index 000000000..ba5ab60ac
--- /dev/null
+++ b/utils/text_to_lexicon.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+import argparse
+from collections import Counter
+
+
+def main(args):
+    """Write a char-level lexicon line (``word<TAB>c1 c2 ...``) for each distinct word in args.text."""
+    counter = Counter()
+    # Explicit UTF-8 so the Chinese text does not depend on the locale's default encoding.
+    with open(args.text, 'r', encoding='utf-8') as fin:
+        for line in fin:
+            fields = line.split()
+            # With has_key the first field is the utterance id; blank or
+            # key-only lines simply contribute no words instead of raising.
+            counter.update(fields[1:] if args.has_key else fields)
+
+    # Counter keeps first-seen order, so entries follow their order in the text.
+    with open(args.lexicon, 'w', encoding='utf-8') as fout:
+        for word in counter:
+            # A word is spelled as its own characters separated by spaces.
+            val = " ".join(word)
+            fout.write(f"{word}\t{val}\n")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='text(line:utt1 中国 人) to lexicon（line:中国 中 国).')
+    # argparse passes the raw string and any non-empty string (even 'false') is truthy, so convert it explicitly.
+    parser.add_argument(
+        '--has_key', default=True, help='whether each text line starts with an utt key (true/false)',
+        type=lambda s: s.strip().lower() in ('1', 'true', 't', 'yes', 'y'))
+    parser.add_argument(
+        '--text', required=True, help='text path. line: utt1 中国 人 or 中国 人')
+    parser.add_argument('--lexicon', required=True, help='lexicon path. line:中国 中 国')
+    args = parser.parse_args()
+    print(args)
+    main(args)

From 6c57c2bf8e3568ab5518731de113d075467aeb9a Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Wed, 25 May 2022 21:32:14 +0800
Subject: [PATCH 09/40] Dynamic cli commands registration.

---
 paddlespeech/cli/__init__.py      |  7 -------
 paddlespeech/cli/asr/infer.py     |  3 ---
 paddlespeech/cli/base_commands.py | 18 ++++++++++++++++++
 paddlespeech/cli/cls/infer.py     |  5 +----
 paddlespeech/cli/entry.py         |  5 +++++
 paddlespeech/cli/st/infer.py      |  3 ---
 paddlespeech/cli/text/infer.py    |  2 --
 paddlespeech/cli/tts/infer.py     |  3 ---
 paddlespeech/cli/utils.py         | 11 +++++++++++
 paddlespeech/cli/vector/infer.py  |  6 +-----
 10 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index ddf0359bc..ca6993f2b 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -13,14 +13,7 @@
 # limitations under the License.
 import _locale
 
-from .asr import ASRExecutor
 from .base_commands import BaseCommand
 from .base_commands import HelpCommand
-from .cls import CLSExecutor
-from .st import STExecutor
-from .stats import StatsExecutor
-from .text import TextExecutor
-from .tts import TTSExecutor
-from .vector import VectorExecutor
 
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 2d74afa6d..09e8202fd 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -29,7 +29,6 @@ from yacs.config import CfgNode
 from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
-from ..utils import cli_register
 from ..utils import CLI_TIMER
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
@@ -45,8 +44,6 @@ __all__ = ['ASRExecutor']
 
 
 @timer_register
-@cli_register(
-    name='paddlespeech.asr', description='Speech to text infer command.')
 class ASRExecutor(BaseExecutor):
     def __init__(self):
         super().__init__()
diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py
index 0a26b1203..4d4d2cc69 100644
--- a/paddlespeech/cli/base_commands.py
+++ b/paddlespeech/cli/base_commands.py
@@ -15,6 +15,7 @@ from typing import List
 
 from .entry import commands
 from .utils import cli_register
+from .utils import explicit_command_register
 from .utils import get_command
 
 __all__ = [
@@ -73,3 +74,20 @@ class VersionCommand:
 
         print(msg)
         return True
+
+
+# Commands registered by dotted class path only; entry.py imports the executor when that command is run.
+_commands = {  # sub-command -> [description, executor class name]
+    'asr': ['Speech to text infer command.', 'ASRExecutor'],
+    'cls': ['Audio classification infer command.', 'CLSExecutor'],
+    'st': ['Speech translation infer command.', 'STExecutor'],
+    'text': ['Text command.', 'TextExecutor'],
+    'tts': ['Text to Speech infer command.', 'TTSExecutor'],
+    'vector': ['Speech to vector embedding infer command.', 'VectorExecutor'],
+}  # NOTE(review): 'stats' has no entry although cli/__init__.py no longer imports it -- confirm it is still registered
+
+for com, info in _commands.items():
+    explicit_command_register(
+        name='paddlespeech.{}'.format(com),
+        description=info[0],
+        cls='paddlespeech.cli.{}.{}'.format(com, info[1]))  # i.e. 'paddlespeech.cli.<com>.<ClassName>'
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
index 40072d997..3d807b60b 100644
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -27,7 +27,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
 
 from ..executor import BaseExecutor
 from ..log import logger
-from ..utils import cli_register
 from ..utils import stats_wrapper
 from .pretrained_models import model_alias
 from .pretrained_models import pretrained_models
@@ -36,8 +35,6 @@ from .pretrained_models import pretrained_models
 __all__ = ['CLSExecutor']
 
 
-@cli_register(
-    name='paddlespeech.cls', description='Audio classification infer command.')
 class CLSExecutor(BaseExecutor):
     def __init__(self):
         super().__init__()
@@ -246,4 +243,4 @@ class CLSExecutor(BaseExecutor):
         self.infer()
         res = self.postprocess(topk)  # Retrieve result of cls.
 
-        return res
\ No newline at end of file
+        return res
diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py
index 32123ece7..e0c306d62 100644
--- a/paddlespeech/cli/entry.py
+++ b/paddlespeech/cli/entry.py
@@ -34,6 +34,11 @@ def _execute():
     # The method 'execute' of a command instance returns 'True' for a success
     # while 'False' for a failure. Here converts this result into a exit status
     # in bash: 0 for a success and 1 for a failure.
+    if not callable(com['_entry']):
+        # Lazily registered command: '_entry' still holds a dotted 'package.module.Class' path.
+        import importlib  # local import; avoids exec() + locals(), which breaks on Python 3.13 (PEP 667)
+        module, _, cls = com['_entry'].rpartition('.')
+        com['_entry'] = getattr(importlib.import_module(module), cls)  # cache the class for later calls
     status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1
     return status
 
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index 4f210fbe6..ae188b349 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -28,7 +28,6 @@ from yacs.config import CfgNode
 
 from ..executor import BaseExecutor
 from ..log import logger
-from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
@@ -42,8 +41,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
 __all__ = ["STExecutor"]
 
 
-@cli_register(
-    name="paddlespeech.st", description="Speech translation infer command.")
 class STExecutor(BaseExecutor):
     def __init__(self):
         super().__init__()
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index 97f3bbe21..be5b5a10d 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -23,7 +23,6 @@ import paddle
 
 from ..executor import BaseExecutor
 from ..log import logger
-from ..utils import cli_register
 from ..utils import stats_wrapper
 from .pretrained_models import model_alias
 from .pretrained_models import pretrained_models
@@ -33,7 +32,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
 __all__ = ['TextExecutor']
 
 
-@cli_register(name='paddlespeech.text', description='Text infer command.')
 class TextExecutor(BaseExecutor):
     def __init__(self):
         super().__init__()
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index efab9cb25..5fa9b3ed0 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -28,7 +28,6 @@ from yacs.config import CfgNode
 
 from ..executor import BaseExecutor
 from ..log import logger
-from ..utils import cli_register
 from ..utils import stats_wrapper
 from .pretrained_models import model_alias
 from .pretrained_models import pretrained_models
@@ -40,8 +39,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
 __all__ = ['TTSExecutor']
 
 
-@cli_register(
-    name='paddlespeech.tts', description='Text to Speech infer command.')
 class TTSExecutor(BaseExecutor):
     def __init__(self):
         super().__init__()
diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py
index e7b499f72..128767e62 100644
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@@ -41,6 +41,7 @@ requests.adapters.DEFAULT_RETRIES = 3
 __all__ = [
     'timer_register',
     'cli_register',
+    'explicit_command_register',
     'get_command',
     'download_and_decompress',
     'load_state_dict_from_url',
@@ -70,6 +71,16 @@ def cli_register(name: str, description: str='') -> Any:
     return _warpper
 
 
+def explicit_command_register(name: str, description: str='', cls: str=''):
+    """Store `cls` (a dotted class path string) as the '_entry' of the command addressed by dotted `name`."""
+    node = commands
+    for part in name.split('.'):  # descend one level per dotted component
+        node = node[part]
+    node['_entry'] = cls
+    if description:  # an empty description leaves any existing '_description' untouched
+        node['_description'] = description
+
+
 def get_command(name: str) -> Any:
     items = name.split('.')
     com = commands
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index cc664369f..07fb73a4c 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -28,7 +28,6 @@ from yacs.config import CfgNode
 
 from ..executor import BaseExecutor
 from ..log import logger
-from ..utils import cli_register
 from ..utils import stats_wrapper
 from .pretrained_models import model_alias
 from .pretrained_models import pretrained_models
@@ -37,9 +36,6 @@ from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 
-@cli_register(
-    name="paddlespeech.vector",
-    description="Speech to vector embedding infer command.")
 class VectorExecutor(BaseExecutor):
     def __init__(self):
         super().__init__()
@@ -476,4 +472,4 @@ class VectorExecutor(BaseExecutor):
         else:
             logger.info("The audio file format is right")
 
-        return True
\ No newline at end of file
+        return True

From 27a5de1af7852a70526673495250cf3ae0bc6b86 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 26 May 2022 10:35:08 +0800
Subject: [PATCH 10/40] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a43e21bd2..c9d4796c8 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,8 @@
   | <a href="#documents"> Documents </a>
   | <a href="#model-list"> Models List </a>
   | <a href="https://aistudio.baidu.com/aistudio/education/group/info/25130"> AIStudio Courses </a>
+  | <a href="https://arxiv.org/abs/2205.12007"> Paper </a>
+  | <a href="https://gitee.com/paddlepaddle/PaddleSpeech"> Gitee </a>
 </h4>
 </div>
 

From fe3474729de6dd0720dd1f848eb92a480f485843 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 26 May 2022 10:36:05 +0800
Subject: [PATCH 11/40] Update README_cn.md

---
 README_cn.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README_cn.md b/README_cn.md
index ed5c6a90d..c751b061d 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -25,6 +25,8 @@
   | <a href="#教程文档"> 教程文档 </a>
   | <a href="#模型列表"> 模型列表 </a>
   | <a href="https://aistudio.baidu.com/aistudio/education/group/info/25130"> AIStudio 课程 </a>
+  | <a href="https://arxiv.org/abs/2205.12007"> 论文 </a>
+  | <a href="https://gitee.com/paddlepaddle/PaddleSpeech"> Gitee </a>
 </h4>
 </div>
 

From 780da806d75f8e07ba62ec47e16a2b5cfa636ac7 Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Thu, 26 May 2022 03:46:01 +0000
Subject: [PATCH 12/40] fix test_cli, test=doc

---
 tests/unit/cli/test_cli.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index e1f1853f6..e0ebd1412 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -25,7 +25,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w
 # long audio restriction
 {
 wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
-paddlespeech asr --input test_long_audio_01.wav
+paddlespeech asr --model deepspeech2online_wenetspeech --input test_long_audio_01.wav -y
 if [ $? -ne 255 ]; then
    echo -e "\e[1;31mTime restriction not passed\e[0m"
    exit 1
@@ -54,7 +54,7 @@ paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input
 # Speech Translation (only support linux)
 paddlespeech st --input ./en.wav
 
-# Speaker Verification 
+# Speaker Verification
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 paddlespeech vector --task spk --input 85236145389.wav
 
@@ -65,7 +65,7 @@ echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
 paddlespeech vector --task spk --input vec.job
 
 echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector --task spk
-rm 85236145389.wav 
+rm 85236145389.wav
 rm vec.job
 
 # shell pipeline

From 49dadc8044ace30a12782775dc1a8c659a5b30e7 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 26 May 2022 13:32:26 +0800
Subject: [PATCH 13/40] Update usage and doc of cli executor.

---
 demos/audio_searching/src/encode.py           | 2 +-
 demos/audio_tagging/README.md                 | 2 +-
 demos/audio_tagging/README_cn.md              | 2 +-
 demos/automatic_video_subtitiles/README.md    | 3 ++-
 demos/automatic_video_subtitiles/README_cn.md | 3 ++-
 demos/automatic_video_subtitiles/recognize.py | 4 ++--
 demos/punctuation_restoration/README.md       | 2 +-
 demos/punctuation_restoration/README_cn.md    | 2 +-
 demos/speaker_verification/README.md          | 2 +-
 demos/speaker_verification/README_cn.md       | 2 +-
 demos/speech_recognition/README.md            | 2 +-
 demos/speech_recognition/README_cn.md         | 2 +-
 demos/speech_translation/README.md            | 2 +-
 demos/speech_translation/README_cn.md         | 2 +-
 demos/text_to_speech/README.md                | 2 +-
 demos/text_to_speech/README_cn.md             | 2 +-
 16 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py
index c89a11c1f..f6bcb00ad 100644
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -14,7 +14,7 @@
 import numpy as np
 from logs import LOGGER
 
-from paddlespeech.cli import VectorExecutor
+from paddlespeech.cli.vector import VectorExecutor
 
 vector_executor = VectorExecutor()
 
diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md
index 9d4af0be6..fc4a334ea 100644
--- a/demos/audio_tagging/README.md
+++ b/demos/audio_tagging/README.md
@@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import CLSExecutor
+  from paddlespeech.cli.cls import CLSExecutor
 
   cls_executor = CLSExecutor()
   result = cls_executor(
diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md
index 79f87bf8c..36b5d8aaf 100644
--- a/demos/audio_tagging/README_cn.md
+++ b/demos/audio_tagging/README_cn.md
@@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import CLSExecutor
+  from paddlespeech.cli.cls import CLSExecutor
 
   cls_executor = CLSExecutor()
   result = cls_executor(
diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md
index db6da40db..b815425ec 100644
--- a/demos/automatic_video_subtitiles/README.md
+++ b/demos/automatic_video_subtitiles/README.md
@@ -28,7 +28,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import ASRExecutor, TextExecutor
+  from paddlespeech.cli.asr import ASRExecutor
+  from paddlespeech.cli.text import TextExecutor
 
   asr_executor = ASRExecutor()
   text_executor = TextExecutor()
diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md
index fc7b2cf6a..990ff6dbd 100644
--- a/demos/automatic_video_subtitiles/README_cn.md
+++ b/demos/automatic_video_subtitiles/README_cn.md
@@ -23,7 +23,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import ASRExecutor, TextExecutor
+  from paddlespeech.cli.asr import ASRExecutor
+  from paddlespeech.cli.text import TextExecutor
 
   asr_executor = ASRExecutor()
   text_executor = TextExecutor()
diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py
index 72e3c3a85..304599d19 100644
--- a/demos/automatic_video_subtitiles/recognize.py
+++ b/demos/automatic_video_subtitiles/recognize.py
@@ -16,8 +16,8 @@ import os
 
 import paddle
 
-from paddlespeech.cli import ASRExecutor
-from paddlespeech.cli import TextExecutor
+from paddlespeech.cli.asr import ASRExecutor
+from paddlespeech.cli.text import TextExecutor
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md
index 518d437dc..458ab92f9 100644
--- a/demos/punctuation_restoration/README.md
+++ b/demos/punctuation_restoration/README.md
@@ -42,7 +42,7 @@ The input of this demo should be a text of the specific language that can be pas
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import TextExecutor
+  from paddlespeech.cli.text import TextExecutor
 
   text_executor = TextExecutor()
   result = text_executor(
diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md
index 9d4be8bf0..f25acdadb 100644
--- a/demos/punctuation_restoration/README_cn.md
+++ b/demos/punctuation_restoration/README_cn.md
@@ -44,7 +44,7 @@
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import TextExecutor
+  from paddlespeech.cli.text import TextExecutor
 
   text_executor = TextExecutor()
   result = text_executor(
diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md
index 63dc9294e..900b5ae40 100644
--- a/demos/speaker_verification/README.md
+++ b/demos/speaker_verification/README.md
@@ -96,7 +96,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 
 - Python API
   ```python
-  from paddlespeech.cli import VectorExecutor
+  from paddlespeech.cli.vector import VectorExecutor
 
   vector_executor = VectorExecutor()
   audio_emb = vector_executor(
diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md
index 07eeac2ee..f6afa86ac 100644
--- a/demos/speaker_verification/README_cn.md
+++ b/demos/speaker_verification/README_cn.md
@@ -95,7 +95,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import VectorExecutor
+  from paddlespeech.cli.vector import VectorExecutor
 
   vector_executor = VectorExecutor()
   audio_emb = vector_executor(
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index 6493e8e61..c815a88af 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -58,7 +58,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import ASRExecutor
+  from paddlespeech.cli.asr import ASRExecutor
 
   asr_executor = ASRExecutor()
   text = asr_executor(
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index 8d631d89c..13aa9f277 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import ASRExecutor
+  from paddlespeech.cli.asr import ASRExecutor
 
   asr_executor = ASRExecutor()
   text = asr_executor(
diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md
index f675a4eda..00a9c7932 100644
--- a/demos/speech_translation/README.md
+++ b/demos/speech_translation/README.md
@@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import STExecutor
+  from paddlespeech.cli.st import STExecutor
 
   st_executor = STExecutor()
   text = st_executor(
diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md
index bad9b392f..5119bf9f4 100644
--- a/demos/speech_translation/README_cn.md
+++ b/demos/speech_translation/README_cn.md
@@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import STExecutor
+  from paddlespeech.cli.st import STExecutor
   
   st_executor = STExecutor()
   text = st_executor(
diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
index 2df72a82d..389847a12 100644
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@@ -77,7 +77,7 @@ The input of this demo should be a text of the specific language that can be pas
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import TTSExecutor
+  from paddlespeech.cli.tts import TTSExecutor
 
   tts_executor = TTSExecutor()
   wav_file = tts_executor(
diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md
index 7e02b9624..f967d3d4d 100644
--- a/demos/text_to_speech/README_cn.md
+++ b/demos/text_to_speech/README_cn.md
@@ -80,7 +80,7 @@
 - Python API
   ```python
   import paddle
-  from paddlespeech.cli import TTSExecutor
+  from paddlespeech.cli.tts import TTSExecutor
 
   tts_executor = TTSExecutor()
   wav_file = tts_executor(

From 418cc37ffb43773d24c486069a3b7e346bd8e5ae Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Thu, 26 May 2022 22:47:30 +0800
Subject: [PATCH 14/40] refactor file org & rename binary files

---
 speechx/examples/codelab/README.md            |   8 +
 speechx/examples/codelab/decoder/.gitignore   |   2 +
 speechx/examples/codelab/decoder/README.md    |  12 ++
 speechx/examples/codelab/decoder/path.sh      |  14 ++
 speechx/examples/codelab/decoder/run.sh       |  78 +++++++
 speechx/examples/codelab/decoder/valgrind.sh  |  26 +++
 speechx/examples/codelab/feat/README.md       |   7 +
 speechx/examples/codelab/feat/path.sh         |  14 ++
 speechx/examples/codelab/feat/run.sh          |  57 +++++
 speechx/examples/codelab/feat/valgrind.sh     |  24 +++
 speechx/examples/codelab/nnet/.gitignore      |   2 +
 speechx/examples/codelab/nnet/README.md       |   3 +
 speechx/examples/codelab/nnet/path.sh         |  14 ++
 speechx/examples/codelab/nnet/run.sh          |  29 +++
 speechx/examples/codelab/nnet/valgrind.sh     |  21 ++
 speechx/examples/custom_asr/run.sh            |   8 +-
 .../ds2_ol/aishell/local/text_to_lexicon.py   |  37 ----
 speechx/examples/ds2_ol/aishell/path.sh       |   8 +-
 speechx/examples/ds2_ol/aishell/run.sh        |  12 +-
 speechx/examples/ds2_ol/aishell/run_fbank.sh  |  12 +-
 .../examples/ds2_ol/websocket/CMakeLists.txt  |   9 -
 speechx/examples/ds2_ol/websocket/path.sh     |   8 +-
 speechx/speechx/CMakeLists.txt                |   6 +
 .../codelab}/CMakeLists.txt                   |   4 +-
 speechx/speechx/codelab/README.md             |   7 +
 speechx/speechx/codelab/glog/CMakeLists.txt   |   8 +
 speechx/speechx/codelab/glog/README.md        |  38 ++++
 .../codelab/glog/glog_logtostderr_main.cc     |  25 +++
 speechx/speechx/codelab/glog/glog_main.cc     |  23 ++
 speechx/speechx/codelab/nnet/CMakeLists.txt   |   6 +
 .../codelab/nnet/ds2_model_test_main.cc       | 203 ++++++++++++++++++
 speechx/speechx/decoder/CMakeLists.txt        |  13 ++
 .../ctc_prefix_beam_search_decoder_main.cc    | 167 ++++++++++++++
 .../decoder/nnet_logprob_decoder_main.cc      |  74 +++++++
 speechx/speechx/decoder/recognizer_main.cc    |  99 +++++++++
 speechx/speechx/decoder/tlg_decoder_main.cc   | 169 +++++++++++++++
 speechx/speechx/frontend/audio/CMakeLists.txt |  15 +-
 .../frontend/audio/cmvn_json2kaldi_main.cc    |  85 ++++++++
 .../frontend/audio/compute_fbank_main.cc      | 143 ++++++++++++
 .../audio/compute_linear_spectrogram_main.cc  | 145 +++++++++++++
 speechx/speechx/websocket/CMakeLists.txt      |   8 +
 .../websocket/websocket_client_main.cc        |   0
 .../websocket/websocket_server_main.cc        |   0
 43 files changed, 1570 insertions(+), 73 deletions(-)
 create mode 100644 speechx/examples/codelab/README.md
 create mode 100644 speechx/examples/codelab/decoder/.gitignore
 create mode 100644 speechx/examples/codelab/decoder/README.md
 create mode 100644 speechx/examples/codelab/decoder/path.sh
 create mode 100755 speechx/examples/codelab/decoder/run.sh
 create mode 100755 speechx/examples/codelab/decoder/valgrind.sh
 create mode 100644 speechx/examples/codelab/feat/README.md
 create mode 100644 speechx/examples/codelab/feat/path.sh
 create mode 100755 speechx/examples/codelab/feat/run.sh
 create mode 100755 speechx/examples/codelab/feat/valgrind.sh
 create mode 100644 speechx/examples/codelab/nnet/.gitignore
 create mode 100644 speechx/examples/codelab/nnet/README.md
 create mode 100644 speechx/examples/codelab/nnet/path.sh
 create mode 100755 speechx/examples/codelab/nnet/run.sh
 create mode 100755 speechx/examples/codelab/nnet/valgrind.sh
 delete mode 100755 speechx/examples/ds2_ol/aishell/local/text_to_lexicon.py
 delete mode 100644 speechx/examples/ds2_ol/websocket/CMakeLists.txt
 rename speechx/{examples => speechx/codelab}/CMakeLists.txt (52%)
 create mode 100644 speechx/speechx/codelab/README.md
 create mode 100644 speechx/speechx/codelab/glog/CMakeLists.txt
 create mode 100644 speechx/speechx/codelab/glog/README.md
 create mode 100644 speechx/speechx/codelab/glog/glog_logtostderr_main.cc
 create mode 100644 speechx/speechx/codelab/glog/glog_main.cc
 create mode 100644 speechx/speechx/codelab/nnet/CMakeLists.txt
 create mode 100644 speechx/speechx/codelab/nnet/ds2_model_test_main.cc
 create mode 100644 speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
 create mode 100644 speechx/speechx/decoder/nnet_logprob_decoder_main.cc
 create mode 100644 speechx/speechx/decoder/recognizer_main.cc
 create mode 100644 speechx/speechx/decoder/tlg_decoder_main.cc
 create mode 100644 speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
 create mode 100644 speechx/speechx/frontend/audio/compute_fbank_main.cc
 create mode 100644 speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
 rename speechx/{examples/ds2_ol => speechx}/websocket/websocket_client_main.cc (100%)
 rename speechx/{examples/ds2_ol => speechx}/websocket/websocket_server_main.cc (100%)

diff --git a/speechx/examples/codelab/README.md b/speechx/examples/codelab/README.md
new file mode 100644
index 000000000..f89184de9
--- /dev/null
+++ b/speechx/examples/codelab/README.md
@@ -0,0 +1,8 @@
+# Codelab
+
+## introduction
+
+> The examples below are for development and offline testing. Do not run them unless you know what they do.
+* nnet
+* feat
+* decoder
diff --git a/speechx/examples/codelab/decoder/.gitignore b/speechx/examples/codelab/decoder/.gitignore
new file mode 100644
index 000000000..bbd86a25b
--- /dev/null
+++ b/speechx/examples/codelab/decoder/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/speechx/examples/codelab/decoder/README.md b/speechx/examples/codelab/decoder/README.md
new file mode 100644
index 000000000..ead3b8e13
--- /dev/null
+++ b/speechx/examples/codelab/decoder/README.md
@@ -0,0 +1,12 @@
+# ASR Decoder
+
+ASR Decoder test bins. We use these bins to test the CTC beam search decoder and the WFST decoder.
+
+* decoder_test_main.cc
+feeds nnet output logprobs and tests only the decoder.
+
+* offline_decoder_sliding_chunk_main.cc
+feeds streaming audio features and decodes in a streaming manner.
+
+* offline_wfst_decoder_main.cc
+feeds streaming audio features and decodes with WFST in a streaming manner.
diff --git a/speechx/examples/codelab/decoder/path.sh b/speechx/examples/codelab/decoder/path.sh
new file mode 100644
index 000000000..9d2291743
--- /dev/null
+++ b/speechx/examples/codelab/decoder/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of the built binaries required for running the examples.
+
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; }
+
+export LC_AL=C
+
+SPEECHX_BIN=$SPEECHX_ROOT/build/speechx/decoder:$SPEECHX_ROOT/build/speechx/frontend/audio
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/codelab/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh
new file mode 100755
index 000000000..a911eb033
--- /dev/null
+++ b/speechx/examples/codelab/decoder/run.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+set +x
+set -e
+
+. path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+# input
+mkdir -p data
+data=$PWD/data
+ckpt_dir=$data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+vocb_dir=$ckpt_dir/data/lang_char/
+
+lm=$data/zh_giga.no_cna_cmn.prune01244.klm
+
+# output
+exp_dir=./exp
+mkdir -p $exp_dir
+
+# 2. download model
+if [[ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+if [ ! -f data/wav.scp ]; then
+    pushd data
+    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+    echo "utt1 " $PWD/zh.wav > wav.scp
+    popd 
+fi
+
+# download lm
+if [ ! -f $lm ]; then
+    pushd data
+    wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
+    popd
+fi
+
+feat_wspecifier=$exp_dir/feats.ark
+cmvn=$exp_dir/cmvn.ark
+
+export GLOG_logtostderr=1
+
+# dump json cmvn to kaldi
+cmvn_json2kaldi_main \
+    --json_file  $ckpt_dir/data/mean_std.json \
+    --cmvn_write_path $cmvn \
+    --binary=false
+echo "convert json cmvn to kaldi ark."
+
+
+# generate linear feature as streaming
+compute_linear_spectrogram_main \
+    --wav_rspecifier=scp:$data/wav.scp \
+    --feature_wspecifier=ark,t:$feat_wspecifier \
+    --cmvn_file=$cmvn
+echo "compute linear spectrogram feature."
+
+# run ctc beam search decoder as streaming
+ctc_prefix_beam_search_decoder_main \
+    --result_wspecifier=ark,t:$exp_dir/result.txt \
+    --feature_rspecifier=ark:$feat_wspecifier \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams \
+    --dict_file=$vocb_dir/vocab.txt \
+    --lm_path=$lm
diff --git a/speechx/examples/codelab/decoder/valgrind.sh b/speechx/examples/codelab/decoder/valgrind.sh
new file mode 100755
index 000000000..14efe0ba4
--- /dev/null
+++ b/speechx/examples/codelab/decoder/valgrind.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# this script is for memory check, so please run ./run.sh first.
+
+set +x
+set -e
+
+. ./path.sh
+
+if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
+  echo "please install valgrind in the speechx tools dir.\n" 
+  exit 1
+fi
+
+model_dir=../paddle_asr_model
+feat_wspecifier=./feats.ark
+cmvn=./cmvn.ark
+
+valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
+  offline_decoder_main \
+  --feature_respecifier=ark:$feat_wspecifier \
+  --model_path=$model_dir/avg_1.jit.pdmodel \
+  --param_path=$model_dir/avg_1.jit.pdparams \
+  --dict_file=$model_dir/vocab.txt \
+  --lm_path=$model_dir/avg_1.jit.klm
+
diff --git a/speechx/examples/codelab/feat/README.md b/speechx/examples/codelab/feat/README.md
new file mode 100644
index 000000000..e59e02bf9
--- /dev/null
+++ b/speechx/examples/codelab/feat/README.md
@@ -0,0 +1,7 @@
+# Deepspeech2 Streaming Audio Feature
+
+ASR audio feature test bins. We use these bins to test linear/fbank/mfcc ASR features in a streaming manner.
+
+* compute_linear_spectrogram_main.cc
+
+compute linear spectrogram without db norm in streaming manner.
diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh
new file mode 100644
index 000000000..3b89d01e9
--- /dev/null
+++ b/speechx/examples/codelab/feat/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of the built binaries required for running the examples.
+
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+
+export LC_AL=C
+
+SPEECHX_BIN=$SPEECHX_ROOT/build/speechx/decoder:$SPEECHX_ROOT/build/speechx/frontend/audio
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh
new file mode 100755
index 000000000..1fa37f981
--- /dev/null
+++ b/speechx/examples/codelab/feat/run.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set +x
+set -e
+
+. ./path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+# 2. download model
+if [ ! -e data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+if [ ! -f data/wav.scp ]; then
+    mkdir -p data
+    pushd data
+    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+    echo "utt1 " $PWD/zh.wav > wav.scp
+    popd 
+fi
+
+
+# input
+data_dir=./data
+exp_dir=./exp
+model_dir=$data_dir/model/
+
+mkdir -p $exp_dir
+
+
+# 3. run feat
+export GLOG_logtostderr=1
+
+cmvn_json2kaldi_main \
+    --json_file  $model_dir/data/mean_std.json \
+    --cmvn_write_path $exp_dir/cmvn.ark \
+    --binary=false
+echo "convert json cmvn to kaldi ark."
+
+
+compute_linear_spectrogram_main \
+    --wav_rspecifier=scp:$data_dir/wav.scp \
+    --feature_wspecifier=ark,t:$exp_dir/feats.ark \
+    --cmvn_file=$exp_dir/cmvn.ark
+echo "compute linear spectrogram feature."
+
+
diff --git a/speechx/examples/codelab/feat/valgrind.sh b/speechx/examples/codelab/feat/valgrind.sh
new file mode 100755
index 000000000..ea50fdc23
--- /dev/null
+++ b/speechx/examples/codelab/feat/valgrind.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# this script is for memory check, so please run ./run.sh first.
+
+set +x
+set -e
+
+. ./path.sh
+
+if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
+  echo "please install valgrind in the speechx tools dir.\n" 
+  exit 1
+fi
+
+model_dir=../paddle_asr_model
+feat_wspecifier=./feats.ark
+cmvn=./cmvn.ark
+
+valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
+  compute_linear_spectrogram_main \
+  --wav_rspecifier=scp:$model_dir/wav.scp \
+  --feature_wspecifier=ark,t:$feat_wspecifier \
+  --cmvn_write_path=$cmvn
+
diff --git a/speechx/examples/codelab/nnet/.gitignore b/speechx/examples/codelab/nnet/.gitignore
new file mode 100644
index 000000000..bbd86a25b
--- /dev/null
+++ b/speechx/examples/codelab/nnet/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/speechx/examples/codelab/nnet/README.md b/speechx/examples/codelab/nnet/README.md
new file mode 100644
index 000000000..772a58f0e
--- /dev/null
+++ b/speechx/examples/codelab/nnet/README.md
@@ -0,0 +1,3 @@
+# Deepspeech2 Streaming NNet Test
+
+Used for the ds2 streaming nnet inference test.
diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh
new file mode 100644
index 000000000..7d395d648
--- /dev/null
+++ b/speechx/examples/codelab/nnet/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of the built binaries required for running the examples.
+
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+
+export LC_AL=C
+
+SPEECHX_BIN=$SPEECHX_BUILD/codelab/nnet
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/codelab/nnet/run.sh b/speechx/examples/codelab/nnet/run.sh
new file mode 100755
index 000000000..842499ba2
--- /dev/null
+++ b/speechx/examples/codelab/nnet/run.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set +x
+set -e
+
+. path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+    pushd ${SPEECHX_ROOT} 
+    bash build.sh
+    popd
+fi
+
+# 2. download model
+if [ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
+    popd
+fi
+
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+
+ds2_model_test_main \
+    --model_path=$model_dir/avg_1.jit.pdmodel \
+    --param_path=$model_dir/avg_1.jit.pdiparams
+
diff --git a/speechx/examples/codelab/nnet/valgrind.sh b/speechx/examples/codelab/nnet/valgrind.sh
new file mode 100755
index 000000000..a5aab6637
--- /dev/null
+++ b/speechx/examples/codelab/nnet/valgrind.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# this script is for memory check, so please run ./run.sh first.
+
+set +x
+set -e
+
+. ./path.sh
+
+if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
+  echo "please install valgrind in the speechx tools dir.\n" 
+  exit 1
+fi
+
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
+
+valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
+  ds2_model_test_main \
+  --model_path=$model_dir/avg_1.jit.pdmodel \
+  --param_path=$model_dir/avg_1.jit.pdparams
diff --git a/speechx/examples/custom_asr/run.sh b/speechx/examples/custom_asr/run.sh
index 8d88000dc..dddcf9fd1 100644
--- a/speechx/examples/custom_asr/run.sh
+++ b/speechx/examples/custom_asr/run.sh
@@ -7,7 +7,7 @@ export GLOG_logtostderr=1
 . ./path.sh || exit 1;
 
 # ds2 means deepspeech2 (acoutic model type)
-dir=$PWD/ds2_graph_with_slot
+dir=$PWD/exp/ds2_graph_with_slot
 data=$PWD/data
 stage=0
 stop_stage=10
@@ -80,9 +80,9 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     --word_symbol_table=$graph/words.txt \
     --graph_path=$graph/TLG.fst --max_active=7500 \
     --acoustic_scale=12 \
-    --result_wspecifier=ark,t:./result_run.txt
+    --result_wspecifier=ark,t:./exp/result_run.txt
 
     # the data/wav.trans is the label.
-    utils/compute-wer.py --char=1 --v=1 data/wav.trans result_run.txt > wer_run
-    tail -n 7 wer_run
+    utils/compute-wer.py --char=1 --v=1 data/wav.trans exp/result_run.txt > exp/wer_run
+    tail -n 7 exp/wer_run
 fi
diff --git a/speechx/examples/ds2_ol/aishell/local/text_to_lexicon.py b/speechx/examples/ds2_ol/aishell/local/text_to_lexicon.py
deleted file mode 100755
index ba5ab60ac..000000000
--- a/speechx/examples/ds2_ol/aishell/local/text_to_lexicon.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-from collections import Counter
-
-
-def main(args):
-    counter = Counter()
-    with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
-        for line in fin:
-            line = line.strip()
-            if args.has_key:
-                utt, text = line.split(maxsplit=1)
-                words = text.split()
-            else:
-                words = line.split()
-
-            counter.update(words)
-
-        for word in counter:
-            val = " ".join(list(word))
-            fout.write(f"{word}\t{val}\n")
-            fout.flush()
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='text(line:utt1 中国 人) to lexicon（line:中国 中 国).')
-    parser.add_argument(
-        '--has_key', default=True, help='text path, with utt or not')
-    parser.add_argument(
-        '--text', required=True, help='text path. line: utt1 中国 人 or 中国 人')
-    parser.add_argument(
-        '--lexicon', required=True, help='lexicon path. line:中国 中 国')
-    args = parser.parse_args()
-    print(args)
-
-    main(args)
diff --git a/speechx/examples/ds2_ol/aishell/path.sh b/speechx/examples/ds2_ol/aishell/path.sh
index 1807a277a..69c78e746 100755
--- a/speechx/examples/ds2_ol/aishell/path.sh
+++ b/speechx/examples/ds2_ol/aishell/path.sh
@@ -1,13 +1,13 @@
 # This contains the locations of binarys build required for running the examples.
 
 MAIN_ROOT=`realpath $PWD/../../../../`
-SPEECHX_ROOT=$PWD/../../..
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
 
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; }
 
 export LC_AL=C
 
@@ -20,5 +20,5 @@ export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
 export SRILM=${MAIN_ROOT}/tools/srilm
 
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat:$SPEECHX_EXAMPLES/ds2_ol/websocket
+SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/websocket
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 650cb1409..e1001e250 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -69,12 +69,12 @@ export GLOG_logtostderr=1
 cmvn=$data/cmvn.ark
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # 3. gen linear feat
-    cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+    cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
 
     ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
 
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
-    linear-spectrogram-wo-db-norm-ol \
+    compute_linear_spectrogram_main \
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
         --cmvn_file=$cmvn \
@@ -85,7 +85,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     #  recognizer
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
-    ctc-prefix-beam-search-decoder-ol \
+    ctc_prefix_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
         --model_path=$model_dir/avg_1.jit.pdmodel \
         --param_path=$model_dir/avg_1.jit.pdiparams \
@@ -102,7 +102,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     #  decode with lm
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
-    ctc-prefix-beam-search-decoder-ol \
+    ctc_prefix_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
         --model_path=$model_dir/avg_1.jit.pdmodel \
         --param_path=$model_dir/avg_1.jit.pdiparams \
@@ -132,7 +132,7 @@ fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     #  TLG decoder
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
-    wfst-decoder-ol \
+    tlg_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
         --model_path=$model_dir/avg_1.jit.pdmodel \
         --param_path=$model_dir/avg_1.jit.pdiparams \
@@ -151,7 +151,7 @@ fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     #  TLG decoder
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
-    recognizer_test_main \
+    recognizer_main \
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_1.jit.pdmodel \
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 483fbfdfe..130f5a8c4 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -69,7 +69,7 @@ export GLOG_logtostderr=1
 cmvn=$data/cmvn_fbank.ark
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # 3. gen linear feat
-    cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false
+    cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false
 
     ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
 
@@ -84,7 +84,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     #  recognizer
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
-    ctc-prefix-beam-search-decoder-ol \
+    ctc_prefix_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
         --model_path=$model_dir/avg_5.jit.pdmodel \
         --param_path=$model_dir/avg_5.jit.pdiparams \
@@ -100,12 +100,12 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     #  decode with lm
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
-    ctc-prefix-beam-search-decoder-ol \
+    ctc_prefix_beam_search_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
         --model_path=$model_dir/avg_5.jit.pdmodel \
         --param_path=$model_dir/avg_5.jit.pdiparams \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-	--model_cache_shapes="5-1-2048,5-1-2048" \
+	    --model_cache_shapes="5-1-2048,5-1-2048" \
         --dict_file=$vocb_dir/vocab.txt \
         --lm_path=$lm \
         --result_wspecifier=ark,t:$data/split${nj}/JOB/fbank_result_lm
@@ -129,13 +129,13 @@ fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     #  TLG decoder
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \
-    wfst-decoder-ol \
+    tlg_decoder_main \
         --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
         --model_path=$model_dir/avg_5.jit.pdmodel \
         --param_path=$model_dir/avg_5.jit.pdiparams \
         --word_symbol_table=$wfst/words.txt \
         --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
-	--model_cache_shapes="5-1-2048,5-1-2048" \
+	    --model_cache_shapes="5-1-2048,5-1-2048" \
         --graph_path=$wfst/TLG.fst --max_active=7500 \
         --acoustic_scale=1.2 \
         --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
diff --git a/speechx/examples/ds2_ol/websocket/CMakeLists.txt b/speechx/examples/ds2_ol/websocket/CMakeLists.txt
deleted file mode 100644
index ed542aad0..000000000
--- a/speechx/examples/ds2_ol/websocket/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc)
-target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(websocket_server_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
-
-add_executable(websocket_client_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_client_main.cc)
-target_include_directories(websocket_client_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(websocket_client_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder websocket ${DEPS})
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/websocket/path.sh b/speechx/examples/ds2_ol/websocket/path.sh
index d66b5dcce..3ad032031 100755
--- a/speechx/examples/ds2_ol/websocket/path.sh
+++ b/speechx/examples/ds2_ol/websocket/path.sh
@@ -1,14 +1,14 @@
 # This contains the locations of binarys build required for running the examples.
 
-SPEECHX_ROOT=$PWD/../../..
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
 
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; }
 
 export LC_AL=C
 
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/websocket:$SPEECHX_EXAMPLES/ds2_ol/feat
+SPEECHX_BIN=$SPEECHX_BUILD/websocket
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt
index b4da095d8..a9a8a398d 100644
--- a/speechx/speechx/CMakeLists.txt
+++ b/speechx/speechx/CMakeLists.txt
@@ -37,3 +37,9 @@ ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/websocket
 )
 add_subdirectory(websocket)
+
+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+${CMAKE_CURRENT_SOURCE_DIR}/codelab
+)
+add_subdirectory(codelab)
diff --git a/speechx/examples/CMakeLists.txt b/speechx/speechx/codelab/CMakeLists.txt
similarity index 52%
rename from speechx/examples/CMakeLists.txt
rename to speechx/speechx/codelab/CMakeLists.txt
index 3c274a20a..950432637 100644
--- a/speechx/examples/CMakeLists.txt
+++ b/speechx/speechx/codelab/CMakeLists.txt
@@ -1,4 +1,4 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 
-add_subdirectory(ds2_ol)
-add_subdirectory(dev)
\ No newline at end of file
+add_subdirectory(glog)
+add_subdirectory(nnet)
diff --git a/speechx/speechx/codelab/README.md b/speechx/speechx/codelab/README.md
new file mode 100644
index 000000000..aee60de67
--- /dev/null
+++ b/speechx/speechx/codelab/README.md
@@ -0,0 +1,7 @@
+
+## For Developers
+
+> Reminder: this directory is for developers only.
+
+* codelab - small programs that speechx developers use for testing.
+
diff --git a/speechx/speechx/codelab/glog/CMakeLists.txt b/speechx/speechx/codelab/glog/CMakeLists.txt
new file mode 100644
index 000000000..08a98641f
--- /dev/null
+++ b/speechx/speechx/codelab/glog/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_executable(glog_main ${CMAKE_CURRENT_SOURCE_DIR}/glog_main.cc)
+target_link_libraries(glog_main glog)
+
+
+add_executable(glog_logtostderr_main ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_main.cc)
+target_link_libraries(glog_logtostderr_main glog)
diff --git a/speechx/speechx/codelab/glog/README.md b/speechx/speechx/codelab/glog/README.md
new file mode 100644
index 000000000..3282c920d
--- /dev/null
+++ b/speechx/speechx/codelab/glog/README.md
@@ -0,0 +1,38 @@
+# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html)
+
+Unless otherwise specified, glog writes to the filename `/tmp/<program name>.<hostname>.<user name>.log.<severity level>.<date>.<time>.<pid>` (e.g., "/tmp/hello_world.example.com.hamaji.log.INFO.20080709-222411.10474"). By default, glog copies the log messages of severity level ERROR or FATAL to standard error (stderr) in addition to log files.
+
+Several flags influence glog's output behavior. If the Google gflags library is installed on your machine, the configure script (see the INSTALL file in the package for detail of this script) will automatically detect and use it, allowing you to pass flags on the command line. For example, if you want to turn the flag --logtostderr on, you can start your application with the following command line:
+
+   `./your_application --logtostderr=1`
+
+If the Google gflags library isn't installed, you set flags via environment variables, prefixing the flag name with "GLOG_", e.g.
+
+   `GLOG_logtostderr=1 ./your_application`
+
+You can also modify flag values in your program by modifying global variables `FLAGS_*` . Most settings start working immediately after you update `FLAGS_*` . The exceptions are the flags related to destination files. For example, you might want to set `FLAGS_log_dir` before calling `google::InitGoogleLogging` . Here is an example:
+
+```c++
+   LOG(INFO) << "file";
+   // Most flags work immediately after updating values.
+   FLAGS_logtostderr = 1;
+   LOG(INFO) << "stderr";
+   FLAGS_logtostderr = 0;
+   // This won't change the log destination. If you want to set this
+   // value, you should do this before google::InitGoogleLogging .
+   FLAGS_log_dir = "/some/log/directory";
+   LOG(INFO) << "the same file";
+```
+
+* This is the test script:
+```
+# run
+glog_main
+
+echo "------"
+export GLOG_logtostderr=1
+glog_main
+
+echo "------"
+glog_logtostderr_main
+```
diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
new file mode 100644
index 000000000..b0616a7de
--- /dev/null
+++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+
+int main(int argc, char* argv[]) {
+    // Initialize glog with the program name before any LOG() call.
+    google::InitGoogleLogging(argv[0]);
+
+    FLAGS_logtostderr = 1;  // route log output to stderr, set from code
+
+    LOG(INFO) << "Found " << 10 << " cookies";
+    LOG(ERROR) << "Found " << 10 << " error";
+}
\ No newline at end of file
diff --git a/speechx/speechx/codelab/glog/glog_main.cc b/speechx/speechx/codelab/glog/glog_main.cc
new file mode 100644
index 000000000..b6275119e
--- /dev/null
+++ b/speechx/speechx/codelab/glog/glog_main.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+
+int main(int argc, char* argv[]) {
+    // Initialize glog with the program name; no FLAGS_* are overridden here.
+    google::InitGoogleLogging(argv[0]);
+
+    LOG(INFO) << "Found " << 10 << " cookies";
+    LOG(ERROR) << "Found " << 10 << " error";
+}
\ No newline at end of file
diff --git a/speechx/speechx/codelab/nnet/CMakeLists.txt b/speechx/speechx/codelab/nnet/CMakeLists.txt
new file mode 100644
index 000000000..dcad8a9c6
--- /dev/null
+++ b/speechx/speechx/codelab/nnet/CMakeLists.txt
@@ -0,0 +1,6 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+set(bin_name ds2_model_test_main)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} PUBLIC nnet gflags glog ${DEPS})
diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
new file mode 100644
index 000000000..283466dc1
--- /dev/null
+++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
@@ -0,0 +1,203 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// deepspeech2 online model info
+
+#include <algorithm>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <thread>
+#include "base/flags.h"
+#include "base/log.h"
+#include "paddle_inference_api.h"
+
+using std::cout;
+using std::endl;
+
+
+DEFINE_string(model_path, "", "xxx.pdmodel");
+DEFINE_string(param_path, "", "xxx.pdiparams");
+DEFINE_int32(chunk_size, 35, "feature chunk size, unit:frame");
+DEFINE_int32(feat_dim, 161, "feature dim");
+
+
+void produce_data(std::vector<std::vector<float>>* data);
+void model_forward_test();
+
+void produce_data(std::vector<std::vector<float>>* data) {
+    // Appends FLAGS_chunk_size rows of FLAGS_feat_dim floats (0.201) to *data.
+    int chunk_size = FLAGS_chunk_size;  // chunk_size in frame
+    int col_size = FLAGS_feat_dim;      // feat dim
+    cout << "chunk size: " << chunk_size << endl;
+    cout << "feat dim: " << col_size << endl;
+    data->reserve(chunk_size);
+    for (int row = 0; row < chunk_size; ++row) {
+        data->push_back(std::vector<float>());
+        data->back().reserve(col_size);  // row exists now; back() is valid
+        for (int col_idx = 0; col_idx < col_size; ++col_idx) {
+            data->back().push_back(0.201);
+        }
+    }
+}
+
+void model_forward_test() {
+    // Smoke test: run one generated chunk through the model and print probs.
+    std::cout << "1. read the data" << std::endl;
+    std::vector<std::vector<float>> feats;
+    produce_data(&feats);
+    CHECK(!feats.empty()) << "chunk_size must be positive";  // feats[0] below
+
+    std::cout << "2. load the model" << std::endl;
+    std::string model_graph = FLAGS_model_path;
+    std::string model_params = FLAGS_param_path;
+    CHECK(model_graph != "");
+    CHECK(model_params != "");
+    cout << "model path: " << model_graph << endl;
+    cout << "model param path : " << model_params << endl;
+
+    paddle_infer::Config config;
+    config.SetModel(model_graph, model_params);
+    config.SwitchIrOptim(false);
+    cout << "SwitchIrOptim: " << false << endl;
+    config.DisableFCPadding();
+    cout << "DisableFCPadding: " << endl;
+    auto predictor = paddle_infer::CreatePredictor(config);
+
+    std::cout << "3. feat shape, row=" << feats.size()
+              << ",col=" << feats[0].size() << std::endl;
+    std::vector<float> pp_input_mat;
+    for (const auto& item : feats) {
+        pp_input_mat.insert(pp_input_mat.end(), item.begin(), item.end());
+    }
+
+    std::cout << "4. fead the data to model" << std::endl;
+    int row = feats.size();
+    int col = feats[0].size();
+    std::vector<std::string> input_names = predictor->GetInputNames();
+    std::vector<std::string> output_names = predictor->GetOutputNames();
+    for (auto name : input_names) {
+        cout << "model input names: " << name << endl;
+    }
+    for (auto name : output_names) {
+        cout << "model output names: " << name << endl;
+    }
+
+    // input: the row-major flattened features, reshaped to {1, row, col}
+    std::unique_ptr<paddle_infer::Tensor> input_tensor =
+        predictor->GetInputHandle(input_names[0]);
+    std::vector<int> INPUT_SHAPE = {1, row, col};
+    input_tensor->Reshape(INPUT_SHAPE);
+    input_tensor->CopyFromCpu(pp_input_mat.data());
+
+    // input length: a single int64 holding the number of feature rows
+    std::unique_ptr<paddle_infer::Tensor> input_len =
+        predictor->GetInputHandle(input_names[1]);
+    std::vector<int> input_len_size = {1};
+    input_len->Reshape(input_len_size);
+    std::vector<int64_t> audio_len;
+    audio_len.push_back(row);
+    input_len->CopyFromCpu(audio_len.data());
+
+    // state_h: zero-filled buffer of shape {5, 1, 1024}
+    std::unique_ptr<paddle_infer::Tensor> chunk_state_h_box =
+        predictor->GetInputHandle(input_names[2]);
+    std::vector<int> chunk_state_h_box_shape = {5, 1, 1024};
+    chunk_state_h_box->Reshape(chunk_state_h_box_shape);
+    int chunk_state_h_box_size =
+        std::accumulate(chunk_state_h_box_shape.begin(),
+                        chunk_state_h_box_shape.end(),
+                        1,
+                        std::multiplies<int>());
+    std::vector<float> chunk_state_h_box_data(chunk_state_h_box_size, 0.0f);
+    chunk_state_h_box->CopyFromCpu(chunk_state_h_box_data.data());
+
+    // state_c: zero-filled buffer of shape {5, 1, 1024}
+    std::unique_ptr<paddle_infer::Tensor> chunk_state_c_box =
+        predictor->GetInputHandle(input_names[3]);
+    std::vector<int> chunk_state_c_box_shape = {5, 1, 1024};
+    chunk_state_c_box->Reshape(chunk_state_c_box_shape);
+    int chunk_state_c_box_size =
+        std::accumulate(chunk_state_c_box_shape.begin(),
+                        chunk_state_c_box_shape.end(),
+                        1,
+                        std::multiplies<int>());
+    std::vector<float> chunk_state_c_box_data(chunk_state_c_box_size, 0.0f);
+    chunk_state_c_box->CopyFromCpu(chunk_state_c_box_data.data());
+
+    // run; abort on failure instead of reading outputs of a failed inference
+    CHECK(predictor->Run()) << "predictor Run() failed";
+
+    // state_h out
+    std::unique_ptr<paddle_infer::Tensor> h_out =
+        predictor->GetOutputHandle(output_names[2]);
+    std::vector<int> h_out_shape = h_out->shape();
+    int h_out_size = std::accumulate(
+        h_out_shape.begin(), h_out_shape.end(), 1, std::multiplies<int>());
+    std::vector<float> h_out_data(h_out_size);
+    h_out->CopyToCpu(h_out_data.data());
+
+    // state_c out
+    std::unique_ptr<paddle_infer::Tensor> c_out =
+        predictor->GetOutputHandle(output_names[3]);
+    std::vector<int> c_out_shape = c_out->shape();
+    int c_out_size = std::accumulate(
+        c_out_shape.begin(), c_out_shape.end(), 1, std::multiplies<int>());
+    std::vector<float> c_out_data(c_out_size);
+    c_out->CopyToCpu(c_out_data.data());
+
+    // output tensor: copied out flat, then viewed as row x col below
+    std::unique_ptr<paddle_infer::Tensor> output_tensor =
+        predictor->GetOutputHandle(output_names[0]);
+    std::vector<int> output_shape = output_tensor->shape();
+    std::vector<float> output_probs;
+    int output_size = std::accumulate(
+        output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
+    output_probs.resize(output_size);
+    output_tensor->CopyToCpu(output_probs.data());
+    row = output_shape[1];
+    col = output_shape[2];
+
+    // probs: split the flat output into one vector per output row
+    std::vector<std::vector<float>> probs;
+    probs.reserve(row);
+    for (int i = 0; i < row; i++) {
+        probs.push_back(std::vector<float>());
+        probs.back().reserve(col);
+        for (int j = 0; j < col; j++) {
+            probs.back().push_back(output_probs[i * col + j]);
+        }
+    }
+
+    const std::vector<std::vector<float>>& log_feat = probs;  // alias, no copy
+    std::cout << "probs, row: " << log_feat.size()
+              << " col: " << col << std::endl;  // no log_feat[0] on 0 rows
+    for (size_t row_idx = 0; row_idx < log_feat.size(); ++row_idx) {
+        for (size_t col_idx = 0; col_idx < log_feat[row_idx].size();
+             ++col_idx) {
+            std::cout << log_feat[row_idx][col_idx] << " ";
+        }
+        std::cout << std::endl;
+    }
+}
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);  // remove_flags=false
+    google::InitGoogleLogging(argv[0]);
+
+    model_forward_test();  // reads FLAGS_model_path / FLAGS_param_path
+    return 0;
+}
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
index 06bf4020f..1df935112 100644
--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -10,3 +10,16 @@ add_library(decoder STATIC
   recognizer.cc
 )
 target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
+
+set(BINS 
+  ctc_prefix_beam_search_decoder_main
+  nnet_logprob_decoder_main
+  recognizer_main
+  tlg_decoder_main
+)
+
+foreach(bin_name IN LISTS BINS)
+  add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+  target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+  target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
+endforeach()
diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
new file mode 100644
index 000000000..eaec41b71
--- /dev/null
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
@@ -0,0 +1,167 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// TODO: refactor, replace with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "decoder/ctc_beam_search_decoder.h"
+#include "frontend/audio/data_cache.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+#include "nnet/paddle_nnet.h"
+
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
+DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
+DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
+DEFINE_string(lm_path, "", "language model");
+DEFINE_int32(receptive_field_length,
+             7,
+             "receptive field of two CNN(kernel=5) downsampling module.");
+DEFINE_int32(downsampling_rate,
+             4,
+             "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(
+    model_input_names,
+    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
+    "model input names");
+DEFINE_string(model_output_names,
+              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
+              "model output names");
+DEFINE_string(model_cache_names,
+              "chunk_state_h_box,chunk_state_c_box",
+              "model cache names");
+DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+// test ds2 online decoder by feeding speech feature
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    CHECK(FLAGS_result_wspecifier != "");
+    CHECK(FLAGS_feature_rspecifier != "");
+
+    kaldi::SequentialBaseFloatMatrixReader feature_reader(
+        FLAGS_feature_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
+    std::string model_path = FLAGS_model_path;
+    std::string model_params = FLAGS_param_path;
+    std::string dict_file = FLAGS_dict_file;
+    std::string lm_path = FLAGS_lm_path;
+    LOG(INFO) << "model path: " << model_path;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "dict path: " << dict_file;
+    LOG(INFO) << "lm path: " << lm_path;
+
+    int32 num_done = 0, num_err = 0;
+
+    ppspeech::CTCBeamSearchOptions opts;
+    opts.dict_file = dict_file;
+    opts.lm_path = lm_path;
+    ppspeech::CTCBeamSearch decoder(opts);
+
+    ppspeech::ModelOptions model_opts;
+    model_opts.model_path = model_path;
+    model_opts.param_path = model_params;
+    model_opts.cache_names = FLAGS_model_cache_names;
+    model_opts.cache_shape = FLAGS_model_cache_shapes;
+    model_opts.input_names = FLAGS_model_input_names;
+    model_opts.output_names = FLAGS_model_output_names;
+    std::shared_ptr<ppspeech::PaddleNnet> nnet(
+        new ppspeech::PaddleNnet(model_opts));
+    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nnet, raw_data));
+
+    int32 chunk_size = FLAGS_receptive_field_length;
+    int32 chunk_stride = FLAGS_downsampling_rate;
+    int32 receptive_field_length = FLAGS_receptive_field_length;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
+    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
+    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
+    decoder.InitDecoder();
+
+    kaldi::Timer timer;
+    // Per utterance: feed fixed-size chunks, decode, then write the best path.
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+        string utt = feature_reader.Key();
+        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+        raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "process utt: " << utt;
+        LOG(INFO) << "rows: " << feature.NumRows();
+        LOG(INFO) << "cols: " << feature.NumCols();
+
+        // Pad rows so (rows - chunk_size) is a multiple of chunk_stride.
+        int32 padding_len = 0;
+        int32 ori_feature_len = feature.NumRows();
+        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
+            padding_len =
+                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
+            feature.Resize(feature.NumRows() + padding_len,
+                           feature.NumCols(),
+                           kaldi::kCopyData);
+        }
+        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
+        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
+                                                          feature.NumCols());
+            // Count only the original (unpadded) rows this chunk covers.
+            int32 feature_chunk_size = 0;
+            if (ori_feature_len > chunk_idx * chunk_stride) {
+                feature_chunk_size = std::min(
+                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
+            }
+            if (feature_chunk_size < receptive_field_length) break;
+
+            int32 start = chunk_idx * chunk_stride;
+            // Copy chunk_size consecutive rows into the flat chunk buffer.
+            for (int row_id = 0; row_id < chunk_size; ++row_id) {
+                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
+                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
+                    feature_chunk.Data() + row_id * feature.NumCols(),
+                    feature.NumCols());
+                f_chunk_tmp.CopyFromVec(tmp);
+                ++start;
+            }
+            raw_data->Accept(feature_chunk);
+            if (chunk_idx == num_chunks - 1) {
+                raw_data->SetFinished();
+            }
+            decoder.AdvanceDecode(decodable);
+        }
+        std::string result = decoder.GetFinalBestPath();
+        decodable->Reset();
+        decoder.Reset();
+        if (result.empty()) {
+            // the TokenWriter can not write empty string.
+            ++num_err;
+            KALDI_LOG << " the result of " << utt << " is empty";
+            continue;
+        }
+        KALDI_LOG << " the result of " << utt << " is " << result;
+        result_writer.Write(utt, result);
+        ++num_done;
+    }
+
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    double elapsed = timer.Elapsed();
+    KALDI_LOG << " cost:" << elapsed << " s";
+    return (num_done != 0 ? 0 : 1);  // non-zero exit when nothing was decoded
+}
diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
new file mode 100644
index 000000000..0e249cc6b
--- /dev/null
+++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// TODO: refactor, replace with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "decoder/ctc_beam_search_decoder.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+
+DEFINE_string(nnet_prob_respecifier, "", "test nnet prob rspecifier");
+DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
+DEFINE_string(lm_path, "lm.klm", "language model");
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+// test decoder by feeding nnet posterior probability
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
+        FLAGS_nnet_prob_respecifier);
+    std::string dict_file = FLAGS_dict_file;
+    std::string lm_path = FLAGS_lm_path;
+    LOG(INFO) << "dict path: " << dict_file;
+    LOG(INFO) << "lm path: " << lm_path;
+
+    int32 num_done = 0, num_err = 0;
+
+    ppspeech::CTCBeamSearchOptions opts;
+    opts.dict_file = dict_file;
+    opts.lm_path = lm_path;
+    ppspeech::CTCBeamSearch decoder(opts);
+
+    // No nnet/frontend: likelihoods are fed directly via Acceptlikelihood().
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nullptr, nullptr));
+
+    decoder.InitDecoder();
+
+    for (; !likelihood_reader.Done(); likelihood_reader.Next()) {
+        string utt = likelihood_reader.Key();
+        // Bind by reference: only read before Next(), so no copy is needed.
+        const kaldi::Matrix<BaseFloat>& likelihood = likelihood_reader.Value();
+        LOG(INFO) << "process utt: " << utt;
+        LOG(INFO) << "rows: " << likelihood.NumRows();
+        LOG(INFO) << "cols: " << likelihood.NumCols();
+        decodable->Acceptlikelihood(likelihood);
+        decoder.AdvanceDecode(decodable);
+        std::string result = decoder.GetFinalBestPath();
+        KALDI_LOG << " the result of " << utt << " is " << result;
+        decodable->Reset();
+        decoder.Reset();
+        ++num_done;
+    }
+
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);  // non-zero exit when nothing was decoded
+}
diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/decoder/recognizer_main.cc
new file mode 100644
index 000000000..7aef73f74
--- /dev/null
+++ b/speechx/speechx/decoder/recognizer_main.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "decoder/recognizer.h"
+#include "decoder/param.h"
+#include "kaldi/feat/wave-reader.h"
+#include "kaldi/util/table-types.h"
+
+DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_int32(sample_rate, 16000, "sample rate");
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+    // Decode each wav chunk by chunk, write transcripts and report the RTF.
+    ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure();
+    ppspeech::Recognizer recognizer(resource);
+
+    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
+        FLAGS_wav_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
+
+    int sample_rate = FLAGS_sample_rate;
+    float streaming_chunk = FLAGS_streaming_chunk;
+    int chunk_sample_size = streaming_chunk * sample_rate;
+    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
+
+    int32 num_done = 0, num_err = 0;
+    double tot_wav_duration = 0.0;
+
+    kaldi::Timer timer;
+
+    for (; !wav_reader.Done(); wav_reader.Next()) {
+        std::string utt = wav_reader.Key();
+        const kaldi::WaveData& wave_data = wav_reader.Value();
+
+        int32 this_channel = 0;  // Only the first channel is decoded.
+        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
+                                                    this_channel);
+        int tot_samples = waveform.Dim();
+        tot_wav_duration += tot_samples * 1.0 / sample_rate;
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
+        int sample_offset = 0;
+        while (sample_offset < tot_samples) {
+            int cur_chunk_size =
+                std::min(chunk_sample_size, tot_samples - sample_offset);
+
+            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
+            for (int i = 0; i < cur_chunk_size; ++i) {
+                wav_chunk(i) = waveform(sample_offset + i);
+            }
+
+            recognizer.Accept(wav_chunk);
+            // Mark the input finished on the last chunk, including the case
+            // where the wav length is an exact multiple of the chunk size.
+            if (sample_offset + cur_chunk_size >= tot_samples) {
+                recognizer.SetFinished();
+            }
+            recognizer.Decode();
+
+            // no overlap
+            sample_offset += cur_chunk_size;
+        }
+
+        std::string result = recognizer.GetFinalResult();
+        recognizer.Reset();
+        if (result.empty()) {
+            // the TokenWriter can not write empty string.
+            ++num_err;
+            KALDI_LOG << " the result of " << utt << " is empty";
+            continue;
+        }
+        KALDI_LOG << " the result of " << utt << " is " << result;
+        result_writer.Write(utt, result);
+        ++num_done;
+    }
+    double elapsed = timer.Elapsed();
+    KALDI_LOG << "Done " << num_done << " out of " << (num_err + num_done);
+    KALDI_LOG << " cost:" << elapsed << " s";
+    KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
+    if (tot_wav_duration > 0.0) {  // Avoid dividing by zero on empty input.
+        KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
+    }
+}
\ No newline at end of file
diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/tlg_decoder_main.cc
new file mode 100644
index 000000000..fefc16d2c
--- /dev/null
+++ b/speechx/speechx/decoder/tlg_decoder_main.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// todo refactor, replace with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "decoder/ctc_tlg_decoder.h"
+#include "frontend/audio/data_cache.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+#include "nnet/paddle_nnet.h"
+
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
+DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
+DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
+DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
+DEFINE_string(graph_path, "TLG", "decoder graph");
+
+DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
+DEFINE_int32(max_active, 7500, "decoder graph");
+DEFINE_int32(receptive_field_length,
+             7,
+             "receptive field of two CNN(kernel=5) downsampling module.");
+DEFINE_int32(downsampling_rate,
+             4,
+             "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(
+    model_input_names,
+    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
+    "model input names");
+DEFINE_string(model_output_names,
+              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
+              "model output names");
+DEFINE_string(model_cache_names,
+              "chunk_state_h_box,chunk_state_c_box",
+              "model cache names");
+DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+// test TLG decoder by feeding speech feature.
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    kaldi::SequentialBaseFloatMatrixReader feature_reader(
+        FLAGS_feature_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
+    std::string model_graph = FLAGS_model_path;
+    std::string model_params = FLAGS_param_path;
+    std::string word_symbol_table = FLAGS_word_symbol_table;
+    std::string graph_path = FLAGS_graph_path;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "word symbol path: " << word_symbol_table;
+    LOG(INFO) << "graph path: " << graph_path;
+
+    int32 num_done = 0, num_err = 0;
+
+    ppspeech::TLGDecoderOptions opts;
+    opts.word_symbol_table = word_symbol_table;
+    opts.fst_path = graph_path;
+    opts.opts.max_active = FLAGS_max_active;
+    opts.opts.beam = 15.0;          // hard-coded, not exposed as a flag
+    opts.opts.lattice_beam = 7.5;   // hard-coded, not exposed as a flag
+    ppspeech::TLGDecoder decoder(opts);
+
+    ppspeech::ModelOptions model_opts;
+    model_opts.model_path = model_graph;
+    model_opts.param_path = model_params;
+    model_opts.cache_names = FLAGS_model_cache_names;
+    model_opts.cache_shape = FLAGS_model_cache_shapes;
+    model_opts.input_names = FLAGS_model_input_names;
+    model_opts.output_names = FLAGS_model_output_names;
+    std::shared_ptr<ppspeech::PaddleNnet> nnet(
+        new ppspeech::PaddleNnet(model_opts));
+    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
+
+    // chunk_size and receptive_field_length both come from the same flag.
+    int32 chunk_size = FLAGS_receptive_field_length;
+    int32 chunk_stride = FLAGS_downsampling_rate;
+    int32 receptive_field_length = FLAGS_receptive_field_length;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
+    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
+    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
+    decoder.InitDecoder();
+    kaldi::Timer timer;
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+        string utt = feature_reader.Key();
+        // Copied (not referenced) because the matrix may be resized below.
+        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+        raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "process utt: " << utt;
+        LOG(INFO) << "rows: " << feature.NumRows();
+        LOG(INFO) << "cols: " << feature.NumCols();
+
+        int32 row_idx = 0;  // unused in this function
+        int32 padding_len = 0;
+        int32 ori_feature_len = feature.NumRows();
+        // Grow the matrix so (rows - chunk_size) is a multiple of chunk_stride.
+        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
+            padding_len =
+                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
+            feature.Resize(feature.NumRows() + padding_len,
+                           feature.NumCols(),
+                           kaldi::kCopyData);
+        }
+        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
+        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
+                                                          feature.NumCols());
+            // Number of original (unpadded) rows this chunk would cover.
+            int32 feature_chunk_size = 0;
+            if (ori_feature_len > chunk_idx * chunk_stride) {
+                feature_chunk_size = std::min(
+                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
+            }
+            // NOTE(review): breaking here skips the SetFinished() call below
+            // for this utterance -- confirm that this is intended.
+            if (feature_chunk_size < receptive_field_length) break;
+
+            // Flatten chunk_size consecutive rows into one contiguous vector.
+            int32 start = chunk_idx * chunk_stride;
+            for (int row_id = 0; row_id < chunk_size; ++row_id) {
+                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
+                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
+                    feature_chunk.Data() + row_id * feature.NumCols(),
+                    feature.NumCols());
+                f_chunk_tmp.CopyFromVec(tmp);
+                ++start;
+            }
+            raw_data->Accept(feature_chunk);
+            if (chunk_idx == num_chunks - 1) {
+                raw_data->SetFinished();
+            }
+            decoder.AdvanceDecode(decodable);
+        }
+        std::string result;
+        result = decoder.GetFinalBestPath();
+        decodable->Reset();
+        decoder.Reset();
+        if (result.empty()) {
+            // the TokenWriter can not write empty string.
+            ++num_err;
+            KALDI_LOG << " the result of " << utt << " is empty";
+            continue;
+        }
+        KALDI_LOG << " the result of " << utt << " is " << result;
+        result_writer.Write(utt, result);
+        ++num_done;
+    }
+
+    double elapsed = timer.Elapsed();
+    KALDI_LOG << " cost:" << elapsed << " s";
+
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);  // non-zero exit when nothing was decoded
+}
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index 745832fe7..86faf8ced 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -9,5 +9,18 @@ add_library(frontend STATIC
   feature_pipeline.cc
   fbank.cc
 )
-
 target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
+
+#target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
+
+set(BINS 
+  cmvn_json2kaldi_main
+  compute_linear_spectrogram_main
+  compute_fbank_main
+)
+
+foreach(bin_name IN LISTS BINS)  # one executable per name listed in BINS
+  add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+  target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+  target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog)
+endforeach()
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
new file mode 100644
index 000000000..0def14660
--- /dev/null
+++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Note: Do not print/log ondemand object.
+
+#include "base/common.h"
+#include "base/flags.h"
+#include "base/log.h"
+#include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/kaldi-io.h"
+#include "utils/file_utils.h"
+// #include "boost/json.hpp"
+#include <boost/json/src.hpp>
+
+DEFINE_string(json_file, "", "cmvn json file");
+DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
+DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
+
+using namespace boost::json;  // from <boost/json.hpp>
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
+
+    // The file is parsed as a single JSON document that must be an object.
+    std::string json_str = ppspeech::ReadFile2String(FLAGS_json_file);
+    auto value = boost::json::parse(json_str);
+    if (!value.is_object()) {
+        LOG(ERROR) << "Input json file format error.";
+        return 1;  // as_object() below would throw on a non-object value.
+    }
+
+    for (auto obj : value.as_object()) {
+        if (obj.key() == "mean_stat") {
+            LOG(INFO) << "mean_stat:" << obj.value();
+        }
+        if (obj.key() == "var_stat") {
+            LOG(INFO) << "var_stat: " << obj.value();
+        }
+        if (obj.key() == "frame_num") {
+            LOG(INFO) << "frame_num: " << obj.value();
+        }
+    }
+
+    // Read the arrays in place and keep the values as double; the output
+    // matrix is double, so narrowing to float first would only lose digits.
+    const boost::json::array& mean_stat = value.at("mean_stat").as_array();
+    const boost::json::array& var_stat = value.at("var_stat").as_array();
+    if (mean_stat.size() != var_stat.size()) {
+        LOG(ERROR) << "mean_stat and var_stat have different sizes.";
+        return 1;
+    }
+
+    // Kept as 64-bit so the count is not truncated to int32.
+    int64_t frame_num = value.at("frame_num").as_int64();
+    LOG(INFO) << "nframe: " << frame_num;
+
+    // Row 0: mean_stat, with frame_num in the extra last column; row 1: var_stat.
+    size_t mean_size = mean_stat.size();
+    kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
+    for (size_t idx = 0; idx < mean_size; ++idx) {
+        cmvn_stats(0, idx) = mean_stat[idx].as_double();
+        cmvn_stats(1, idx) = var_stat[idx].as_double();
+    }
+    cmvn_stats(0, mean_size) = static_cast<double>(frame_num);
+    LOG(INFO) << cmvn_stats;
+
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
+    LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path;
+    LOG(INFO) << "Binary: " << FLAGS_binary;
+    return 0;
+}
diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
new file mode 100644
index 000000000..67683eebf
--- /dev/null
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// todo refactor, replace with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "kaldi/feat/wave-reader.h"
+#include "kaldi/util/kaldi-io.h"
+#include "kaldi/util/table-types.h"
+
+#include "frontend/audio/audio_cache.h"
+#include "frontend/audio/data_cache.h"
+#include "frontend/audio/fbank.h"
+#include "frontend/audio/feature_cache.h"
+#include "frontend/audio/frontend_itf.h"
+#include "frontend/audio/normalizer.h"
+
+DEFINE_string(wav_rspecifier, "", "test wav scp path");
+DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
+DEFINE_string(cmvn_file, "", "read cmvn");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
+DEFINE_int32(num_bins, 161, "fbank num bins");
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
+        FLAGS_wav_rspecifier);
+    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
+
+    int32 num_done = 0, num_err = 0;  // num_err is never incremented here.
+
+    // feature pipeline: wave cache --> povey window
+    // -->fbank --> global cmvn -> feat cache
+
+    std::unique_ptr<ppspeech::FrontendInterface> data_source(
+        new ppspeech::AudioCache(3600 * 1600, false));
+
+    ppspeech::FbankOptions opt;
+    opt.fbank_opts.frame_opts.frame_length_ms = 25;
+    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
+    opt.streaming_chunk = FLAGS_streaming_chunk;
+    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+    opt.fbank_opts.frame_opts.dither = 0.0;
+
+    std::unique_ptr<ppspeech::FrontendInterface> fbank(
+        new ppspeech::Fbank(opt, std::move(data_source)));
+
+    std::unique_ptr<ppspeech::FrontendInterface> cmvn(
+        new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank)));
+
+    ppspeech::FeatureCacheOptions feat_cache_opts;
+    // the feature cache output feature chunk by chunk.
+    // frame_chunk_size : num frame of a chunk.
+    // frame_chunk_stride: chunk sliding window stride.
+    feat_cache_opts.frame_chunk_stride = 1;
+    feat_cache_opts.frame_chunk_size = 1;
+    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
+    LOG(INFO) << "fbank: " << true;
+    LOG(INFO) << "feat dim: " << feature_cache.Dim();
+
+    int sample_rate = 16000;
+    float streaming_chunk = FLAGS_streaming_chunk;
+    int chunk_sample_size = streaming_chunk * sample_rate;
+    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
+
+    for (; !wav_reader.Done(); wav_reader.Next()) {
+        std::string utt = wav_reader.Key();
+        const kaldi::WaveData& wave_data = wav_reader.Value();
+        LOG(INFO) << "process utt: " << utt;
+
+        int32 this_channel = 0;  // Only the first channel is processed.
+        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
+                                                    this_channel);
+        int tot_samples = waveform.Dim();
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
+        int sample_offset = 0;
+        std::vector<kaldi::Vector<BaseFloat>> feats;
+        int feature_rows = 0;
+        while (sample_offset < tot_samples) {
+            int cur_chunk_size =
+                std::min(chunk_sample_size, tot_samples - sample_offset);
+
+            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
+            for (int i = 0; i < cur_chunk_size; ++i) {
+                wav_chunk(i) = waveform(sample_offset + i);
+            }
+            kaldi::Vector<BaseFloat> features;
+            feature_cache.Accept(wav_chunk);
+            // Finish on the last chunk, even if it is a full-sized one.
+            if (sample_offset + cur_chunk_size >= tot_samples) {
+                feature_cache.SetFinished();
+            }
+            bool flag = true;
+            do {
+                flag = feature_cache.Read(&features);
+                feats.push_back(features);
+                feature_rows += features.Dim() / feature_cache.Dim();
+            } while (flag == true && features.Dim() != 0);
+            sample_offset += cur_chunk_size;
+        }
+
+        int cur_idx = 0;
+        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
+                                                 feature_cache.Dim());
+        for (const auto& feat : feats) {  // By reference: no per-chunk copy.
+            int num_rows = feat.Dim() / feature_cache.Dim();
+            for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
+                for (size_t col_idx = 0; col_idx < feature_cache.Dim();
+                     ++col_idx) {
+                    features(cur_idx, col_idx) =
+                        feat(row_idx * feature_cache.Dim() + col_idx);
+                }
+                ++cur_idx;
+            }
+        }
+        feat_writer.Write(utt, features);
+        feature_cache.Reset();
+
+        if (num_done % 50 == 0 && num_done != 0)
+            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+        num_done++;
+    }
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);  // Non-zero exit when nothing was done.
+}
diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
new file mode 100644
index 000000000..943b74b89
--- /dev/null
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "kaldi/feat/wave-reader.h"
+#include "kaldi/util/kaldi-io.h"
+#include "kaldi/util/table-types.h"
+
+#include "frontend/audio/audio_cache.h"
+#include "frontend/audio/data_cache.h"
+#include "frontend/audio/feature_cache.h"
+#include "frontend/audio/frontend_itf.h"
+#include "frontend/audio/linear_spectrogram.h"
+#include "frontend/audio/normalizer.h"
+
+DEFINE_string(wav_rspecifier, "", "test wav scp path");
+DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
+DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn");
+DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
+
+int main(int argc, char* argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+
+    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
+        FLAGS_wav_rspecifier);
+    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
+
+    int32 num_done = 0, num_err = 0;  // num_err is never incremented here.
+
+    // feature pipeline: wave cache --> hanning window
+    // -->linear_spectrogram --> global cmvn -> feat cache
+
+    std::unique_ptr<ppspeech::FrontendInterface> data_source(
+        new ppspeech::AudioCache(3600 * 1600, true));
+
+    ppspeech::LinearSpectrogramOptions opt;
+    opt.frame_opts.frame_length_ms = 20;
+    opt.frame_opts.frame_shift_ms = 10;
+    opt.streaming_chunk = FLAGS_streaming_chunk;
+    opt.frame_opts.dither = 0.0;
+    opt.frame_opts.remove_dc_offset = false;
+    opt.frame_opts.window_type = "hanning";
+    opt.frame_opts.preemph_coeff = 0.0;
+    LOG(INFO) << "linear feature: " << true;
+    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
+
+    std::unique_ptr<ppspeech::FrontendInterface> linear_spectrogram(
+        new ppspeech::LinearSpectrogram(opt, std::move(data_source)));
+
+    std::unique_ptr<ppspeech::FrontendInterface> cmvn(
+        new ppspeech::CMVN(FLAGS_cmvn_file, std::move(linear_spectrogram)));
+
+    ppspeech::FeatureCacheOptions feat_cache_opts;
+    // the feature cache output feature chunk by chunk.
+    // frame_chunk_size : num frame of a chunk.
+    // frame_chunk_stride: chunk sliding window stride.
+    feat_cache_opts.frame_chunk_stride = 1;
+    feat_cache_opts.frame_chunk_size = 1;
+    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
+    LOG(INFO) << "feat dim: " << feature_cache.Dim();
+
+    int sample_rate = 16000;
+    float streaming_chunk = FLAGS_streaming_chunk;
+    int chunk_sample_size = streaming_chunk * sample_rate;
+    LOG(INFO) << "sample rate: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
+
+    for (; !wav_reader.Done(); wav_reader.Next()) {
+        std::string utt = wav_reader.Key();
+        const kaldi::WaveData& wave_data = wav_reader.Value();
+        LOG(INFO) << "process utt: " << utt;
+
+        int32 this_channel = 0;  // Only the first channel is processed.
+        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
+                                                    this_channel);
+        int tot_samples = waveform.Dim();
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
+        int sample_offset = 0;
+        std::vector<kaldi::Vector<BaseFloat>> feats;
+        int feature_rows = 0;
+        while (sample_offset < tot_samples) {
+            int cur_chunk_size =
+                std::min(chunk_sample_size, tot_samples - sample_offset);
+
+            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
+            for (int i = 0; i < cur_chunk_size; ++i) {
+                wav_chunk(i) = waveform(sample_offset + i);
+            }
+
+            kaldi::Vector<BaseFloat> features;
+            feature_cache.Accept(wav_chunk);
+            // Finish on the last chunk, even if it is a full-sized one.
+            if (sample_offset + cur_chunk_size >= tot_samples) {
+                feature_cache.SetFinished();
+            }
+            bool flag = true;
+            do {
+                flag = feature_cache.Read(&features);
+                feats.push_back(features);
+                feature_rows += features.Dim() / feature_cache.Dim();
+            } while (flag == true && features.Dim() != 0);
+            sample_offset += cur_chunk_size;
+        }
+
+        int cur_idx = 0;
+        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
+                                                 feature_cache.Dim());
+        for (const auto& feat : feats) {  // By reference: no per-chunk copy.
+            int num_rows = feat.Dim() / feature_cache.Dim();
+            for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
+                for (size_t col_idx = 0; col_idx < feature_cache.Dim();
+                     ++col_idx) {
+                    features(cur_idx, col_idx) =
+                        feat(row_idx * feature_cache.Dim() + col_idx);
+                }
+                ++cur_idx;
+            }
+        }
+        feat_writer.Write(utt, features);
+        feature_cache.Reset();
+
+        if (num_done % 50 == 0 && num_done != 0)
+            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+        num_done++;
+    }
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);  // Non-zero exit when nothing was done.
+}
diff --git a/speechx/speechx/websocket/CMakeLists.txt b/speechx/speechx/websocket/CMakeLists.txt
index 582a38031..c3454c399 100644
--- a/speechx/speechx/websocket/CMakeLists.txt
+++ b/speechx/speechx/websocket/CMakeLists.txt
@@ -5,3 +5,11 @@ add_library(websocket STATIC
   websocket_client.cc
 )
 target_link_libraries(websocket PUBLIC frontend decoder nnet)
+
+add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc)  # server executable
+target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(websocket_server_main PUBLIC fst websocket ${DEPS})
+
+add_executable(websocket_client_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_client_main.cc)  # client executable
+target_include_directories(websocket_client_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(websocket_client_main PUBLIC fst websocket ${DEPS})
diff --git a/speechx/examples/ds2_ol/websocket/websocket_client_main.cc b/speechx/speechx/websocket/websocket_client_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/websocket/websocket_client_main.cc
rename to speechx/speechx/websocket/websocket_client_main.cc
diff --git a/speechx/examples/ds2_ol/websocket/websocket_server_main.cc b/speechx/speechx/websocket/websocket_server_main.cc
similarity index 100%
rename from speechx/examples/ds2_ol/websocket/websocket_server_main.cc
rename to speechx/speechx/websocket/websocket_server_main.cc

From be70016edba9552c4d96c26c3f1e76847db17ecb Mon Sep 17 00:00:00 2001
From: r <ryanrussell@users.noreply.github.com>
Date: Thu, 26 May 2022 16:11:05 -0500
Subject: [PATCH 15/40] Improve readability

---
 demos/README.md                  |  6 +++---
 speechx/README.md                |  6 +++---
 third_party/README.md            | 14 +++++++-------
 third_party/ctc_decoders/LICENSE |  2 +-
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/demos/README.md b/demos/README.md
index 8abd67249..2a306df6b 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -2,14 +2,14 @@
 
 ([简体中文](./README_cn.md)|English)
 
-The directory containes many speech applications in multi scenarios.
+This directory contains many speech applications in multiple scenarios.
 
 * audio searching - mass audio similarity retrieval
 * audio tagging - multi-label tagging of an audio file
-* automatic_video_subtitiles - generate subtitles from a video
+* automatic_video_subtitles - generate subtitles from a video
 * metaverse - 2D AR with TTS  
 * punctuation_restoration - restore punctuation from raw text
-* speech recogintion - recognize text of an audio file 
+* speech recognition - recognize text of an audio file 
 * speech server - Server for Speech Task, e.g. ASR,TTS,CLS
 * streaming asr server - receive audio stream from websocket, and recognize to transcript.
 * speech translation - end to end speech translation  
diff --git a/speechx/README.md b/speechx/README.md
index f75d8ac4e..cd1cd62c1 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -44,13 +44,13 @@ More details please see `README.md` under `examples`.
 > If using docker please check `--privileged` is set when `docker run`.
 
 * Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
-```
+```bash
 apt-get install libc6-dbg
 ```
 
 * Install
 
-```
+```bash
 pushd tools
 ./setup_valgrind.sh
 popd
@@ -59,4 +59,4 @@ popd
 ## TODO
 
 ### Deepspeech2 with linear feature
-* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result.
+* DecibelNormalizer: there is a small difference between the offline and online db norm. The online db norm reads features chunk by chunk, so the feature size differs from that of the offline db norm. In `normalizer.cc:73`, the `samples.size()` is different, which causes the results to differ.
diff --git a/third_party/README.md b/third_party/README.md
index c73df5427..843d0d3b2 100644
--- a/third_party/README.md
+++ b/third_party/README.md
@@ -1,27 +1,27 @@
 * [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)  
 commit: fc1bd6240c2008412ab64dc25045cd872f5e126c  
 ref: https://zhuanlan.zhihu.com/p/55371926  
-licence: MIT
+license: MIT
 
 * [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
 commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
-licence: MIT
+license: MIT
 
 * [zhon](https://github.com/tsroten/zhon)
 commit: 09bf543696277f71de502506984661a60d24494c
-licence: MIT
+license: MIT
 
 * [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
 commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
-licence: MIT
+license: MIT
 
 * [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
 commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
-licence: MIT
+license: MIT
 
 * [phkit](https://github.com/KuangDD/phkit.git)
 commit: b2100293c1e36da531d7f30bd52c9b955a649522
-licence: None
+license: None
 
 * [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
-licence: MIT
+license: MIT
diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE
index eeef74b30..ad947f8d7 100644
--- a/third_party/ctc_decoders/LICENSE
+++ b/third_party/ctc_decoders/LICENSE
@@ -5,4 +5,4 @@ score.h and score.cpp is under the LGPL license.
 The two files include the header files from KenLM project.
 
 For the rest:
-The default licence of paddlespeech-ctcdecoders is Apache License 2.0.
+The default license of paddlespeech-ctcdecoders is Apache License 2.0.

From 8373eed67f0a9b2a642e5fe1e95e084a236844ba Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 27 May 2022 00:02:53 +0000
Subject: [PATCH 16/40] fix speechx compile error

---
 speechx/examples/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt
index 3c274a20a..bcb23eddb 100644
--- a/speechx/examples/CMakeLists.txt
+++ b/speechx/examples/CMakeLists.txt
@@ -1,4 +1,3 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 
 add_subdirectory(ds2_ol)
-add_subdirectory(dev)
\ No newline at end of file

From 42fba661c9073c415dfd7d460bc8a510bc46359d Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 27 May 2022 00:03:19 +0000
Subject: [PATCH 17/40] more detail of copyright

---
 examples/wenetspeech/asr1/local/extract_meta.py | 16 +++-------------
 paddlespeech/kws/exps/mdtc/compute_det.py       |  2 ++
 paddlespeech/kws/exps/mdtc/plot_det_curve.py    |  2 ++
 paddlespeech/kws/exps/mdtc/score.py             |  4 +++-
 paddlespeech/kws/models/loss.py                 |  1 +
 paddlespeech/kws/models/mdtc.py                 |  1 +
 paddlespeech/s2t/io/dataset.py                  |  1 +
 paddlespeech/s2t/models/u2/u2.py                |  1 +
 paddlespeech/s2t/models/u2/updater.py           |  2 +-
 paddlespeech/s2t/utils/ctc_utils.py             |  1 +
 paddlespeech/s2t/utils/text_grid.py             |  1 +
 utils/compute-wer.py                            |  2 +-
 12 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
index 0e1b27278..2cad977be 100644
--- a/examples/wenetspeech/asr1/local/extract_meta.py
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -1,18 +1,7 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 # Copyright 2021  Xiaomi Corporation (Author: Yongqing Wang)
 #                 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -24,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import argparse
 import json
 import os
diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py
index e43a953db..853056966 100644
--- a/paddlespeech/kws/exps/mdtc/compute_det.py
+++ b/paddlespeech/kws/exps/mdtc/compute_det.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+#               2022 Shaoqing Yu(954793264@qq.com)
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py
index a3ea21eff..4960281ee 100644
--- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py
+++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+#                    Menglong Xu
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py
index 1b5e1e296..556455ca1 100644
--- a/paddlespeech/kws/exps/mdtc/score.py
+++ b/paddlespeech/kws/exps/mdtc/score.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+#               2022 Shaoqing Yu(954793264@qq.com)
+#               2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py
index 64c9a32c9..bda77f2ba 100644
--- a/paddlespeech/kws/models/loss.py
+++ b/paddlespeech/kws/models/loss.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021 Binbin Zhang
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py
index 5d2e5de64..c605a02b6 100644
--- a/paddlespeech/kws/models/mdtc.py
+++ b/paddlespeech/kws/models/mdtc.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 0e94f047b..9987b5110 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index d5471369f..b4b61666f 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py
index c59090a84..898a50bf0 100644
--- a/paddlespeech/s2t/models/u2/updater.py
+++ b/paddlespeech/s2t/models/u2/updater.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Modified from wenet(https://github.com/wenet-e2e/wenet)
+
 from contextlib import nullcontext
 
 import paddle
diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py
index 886b72033..42564d8e1 100644
--- a/paddlespeech/s2t/utils/ctc_utils.py
+++ b/paddlespeech/s2t/utils/ctc_utils.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py
index cbd9856e4..e696f43d5 100644
--- a/paddlespeech/s2t/utils/text_grid.py
+++ b/paddlespeech/s2t/utils/text_grid.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/utils/compute-wer.py b/utils/compute-wer.py
index 978a80c9f..98bb24a7e 100755
--- a/utils/compute-wer.py
+++ b/utils/compute-wer.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# CopyRight WeNet Apache-2.0 License
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
 import codecs
 import re
 import sys

From aa49d2539ddefc2562a6745e94fb5a54cbdf3576 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 27 May 2022 00:07:01 +0000
Subject: [PATCH 18/40] 2022 year for default copyright

---
 .pre-commit-hooks/copyright-check.hook | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook
index 26044c29e..761edbc01 100644
--- a/.pre-commit-hooks/copyright-check.hook
+++ b/.pre-commit-hooks/copyright-check.hook
@@ -19,7 +19,7 @@ import subprocess
 import platform
 
 COPYRIGHT = '''
-Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

From ae629e2fe6725412875eb31a44175d08b233e94d Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Fri, 27 May 2022 08:55:26 +0800
Subject: [PATCH 19/40] rm ds2_ol test dir

---
 speechx/examples/README.md                    |  11 +-
 speechx/examples/ds2_ol/CMakeLists.txt        |   6 -
 .../examples/ds2_ol/aishell/run_build_tlg.sh  |   2 +-
 speechx/examples/ds2_ol/aishell/run_fbank.sh  |   2 +-
 speechx/examples/ds2_ol/decoder/.gitignore    |   2 -
 .../examples/ds2_ol/decoder/CMakeLists.txt    |  22 --
 speechx/examples/ds2_ol/decoder/README.md     |  12 --
 .../ctc-prefix-beam-search-decoder-ol.cc      | 167 --------------
 .../examples/ds2_ol/decoder/local/model.sh    |   3 -
 .../decoder/nnet-logprob-decoder-test.cc      |  74 -------
 speechx/examples/ds2_ol/decoder/path.sh       |  14 --
 .../ds2_ol/decoder/recognizer_test_main.cc    |  99 ---------
 speechx/examples/ds2_ol/decoder/run.sh        |  78 -------
 speechx/examples/ds2_ol/decoder/valgrind.sh   |  26 ---
 .../ds2_ol/decoder/wfst-decoder-ol.cc         | 169 ---------------
 speechx/examples/ds2_ol/feat/.gitignore       |   2 -
 speechx/examples/ds2_ol/feat/CMakeLists.txt   |  16 --
 speechx/examples/ds2_ol/feat/README.md        |   7 -
 .../examples/ds2_ol/feat/cmvn-json2kaldi.cc   |  85 --------
 .../ds2_ol/feat/compute_fbank_main.cc         | 143 ------------
 .../feat/linear-spectrogram-wo-db-norm-ol.cc  | 147 -------------
 speechx/examples/ds2_ol/feat/path.sh          |  14 --
 speechx/examples/ds2_ol/feat/run.sh           |  57 -----
 speechx/examples/ds2_ol/feat/valgrind.sh      |  24 ---
 speechx/examples/ds2_ol/nnet/.gitignore       |   2 -
 speechx/examples/ds2_ol/nnet/CMakeLists.txt   |   6 -
 speechx/examples/ds2_ol/nnet/README.md        |   3 -
 .../examples/ds2_ol/nnet/ds2-model-ol-test.cc | 203 ------------------
 speechx/examples/ds2_ol/nnet/path.sh          |  14 --
 speechx/examples/ds2_ol/nnet/run.sh           |  38 ----
 speechx/examples/ds2_ol/nnet/valgrind.sh      |  20 --
 utils/README.md                               |   2 +-
 32 files changed, 5 insertions(+), 1465 deletions(-)
 delete mode 100644 speechx/examples/ds2_ol/CMakeLists.txt
 delete mode 100644 speechx/examples/ds2_ol/decoder/.gitignore
 delete mode 100644 speechx/examples/ds2_ol/decoder/CMakeLists.txt
 delete mode 100644 speechx/examples/ds2_ol/decoder/README.md
 delete mode 100644 speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
 delete mode 100644 speechx/examples/ds2_ol/decoder/local/model.sh
 delete mode 100644 speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc
 delete mode 100644 speechx/examples/ds2_ol/decoder/path.sh
 delete mode 100644 speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
 delete mode 100755 speechx/examples/ds2_ol/decoder/run.sh
 delete mode 100755 speechx/examples/ds2_ol/decoder/valgrind.sh
 delete mode 100644 speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
 delete mode 100644 speechx/examples/ds2_ol/feat/.gitignore
 delete mode 100644 speechx/examples/ds2_ol/feat/CMakeLists.txt
 delete mode 100644 speechx/examples/ds2_ol/feat/README.md
 delete mode 100644 speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
 delete mode 100644 speechx/examples/ds2_ol/feat/compute_fbank_main.cc
 delete mode 100644 speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
 delete mode 100644 speechx/examples/ds2_ol/feat/path.sh
 delete mode 100755 speechx/examples/ds2_ol/feat/run.sh
 delete mode 100755 speechx/examples/ds2_ol/feat/valgrind.sh
 delete mode 100644 speechx/examples/ds2_ol/nnet/.gitignore
 delete mode 100644 speechx/examples/ds2_ol/nnet/CMakeLists.txt
 delete mode 100644 speechx/examples/ds2_ol/nnet/README.md
 delete mode 100644 speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc
 delete mode 100644 speechx/examples/ds2_ol/nnet/path.sh
 delete mode 100755 speechx/examples/ds2_ol/nnet/run.sh
 delete mode 100755 speechx/examples/ds2_ol/nnet/valgrind.sh

diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index b18c88e04..1b977523c 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -22,14 +22,7 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel  --port 8022 --host
 
 ## For Developer  
 
-> Warning: Only for developer, make sure you know what's it.
+> Reminder: For developers only; make sure you know what it is.
 
-* dev - for speechx developer, using for test.
+* codelab - for speechx developers, used for testing.
 
-## Build WFST  
-
-> Warning: Using below example when you know what's it.
-
-* text_lm - process text for build lm
-* ngram - using to build NGram ARPA lm.
-* wfst - build wfst for TLG.
diff --git a/speechx/examples/ds2_ol/CMakeLists.txt b/speechx/examples/ds2_ol/CMakeLists.txt
deleted file mode 100644
index 08c194846..000000000
--- a/speechx/examples/ds2_ol/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-add_subdirectory(feat)
-add_subdirectory(nnet)
-add_subdirectory(decoder)
-add_subdirectory(websocket)
diff --git a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
index 4394ac5a0..2e148657b 100755
--- a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
+++ b/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
@@ -118,7 +118,7 @@ export GLOG_logtostderr=1
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     #  TLG decoder
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
-    recognizer_test_main \
+    recognizer_main \
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_5.jit.pdmodel \
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 130f5a8c4..6e1316774 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -148,7 +148,7 @@ fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     utils/run.pl JOB=1:$nj $data/split${nj}/JOB/fbank_recognizer.log \
-    recognizer_test_main \
+    recognizer_main \
         --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
         --cmvn_file=$cmvn \
         --model_path=$model_dir/avg_5.jit.pdmodel \
diff --git a/speechx/examples/ds2_ol/decoder/.gitignore b/speechx/examples/ds2_ol/decoder/.gitignore
deleted file mode 100644
index bbd86a25b..000000000
--- a/speechx/examples/ds2_ol/decoder/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-data
-exp
diff --git a/speechx/examples/ds2_ol/decoder/CMakeLists.txt b/speechx/examples/ds2_ol/decoder/CMakeLists.txt
deleted file mode 100644
index 62dd6862e..000000000
--- a/speechx/examples/ds2_ol/decoder/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-set(bin_name ctc-prefix-beam-search-decoder-ol)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
-
-set(bin_name wfst-decoder-ol)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
-
-
-set(bin_name nnet-logprob-decoder-test)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
-
-add_executable(recognizer_test_main ${CMAKE_CURRENT_SOURCE_DIR}/recognizer_test_main.cc)
-target_include_directories(recognizer_test_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(recognizer_test_main PUBLIC frontend kaldi-feat-common nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
diff --git a/speechx/examples/ds2_ol/decoder/README.md b/speechx/examples/ds2_ol/decoder/README.md
deleted file mode 100644
index ead3b8e13..000000000
--- a/speechx/examples/ds2_ol/decoder/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# ASR Decoder
-
-ASR Decoder test bins. We using theses bins to test CTC BeamSearch decoder and WFST decoder.
-
-* decoder_test_main.cc 
-feed nnet output logprob, and only test decoder
-
-* offline_decoder_sliding_chunk_main.cc
-feed streaming audio feature, decode as streaming manner.
-
-* offline_wfst_decoder_main.cc
-feed streaming audio feature, decode using WFST as streaming manner.
diff --git a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc b/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
deleted file mode 100644
index eaec41b71..000000000
--- a/speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "decoder/ctc_beam_search_decoder.h"
-#include "frontend/audio/data_cache.h"
-#include "kaldi/util/table-types.h"
-#include "nnet/decodable.h"
-#include "nnet/paddle_nnet.h"
-
-DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
-DEFINE_string(result_wspecifier, "", "test result wspecifier");
-DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
-DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
-DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
-DEFINE_string(lm_path, "", "language model");
-DEFINE_int32(receptive_field_length,
-             7,
-             "receptive field of two CNN(kernel=5) downsampling module.");
-DEFINE_int32(downsampling_rate,
-             4,
-             "two CNN(kernel=5) module downsampling rate.");
-DEFINE_string(
-    model_input_names,
-    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
-    "model input names");
-DEFINE_string(model_output_names,
-              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
-              "model output names");
-DEFINE_string(model_cache_names,
-              "chunk_state_h_box,chunk_state_c_box",
-              "model cache names");
-DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
-
-using kaldi::BaseFloat;
-using kaldi::Matrix;
-using std::vector;
-
-// test ds2 online decoder by feeding speech feature
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    CHECK(FLAGS_result_wspecifier != "");
-    CHECK(FLAGS_feature_rspecifier != "");
-
-    kaldi::SequentialBaseFloatMatrixReader feature_reader(
-        FLAGS_feature_rspecifier);
-    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
-    std::string model_path = FLAGS_model_path;
-    std::string model_params = FLAGS_param_path;
-    std::string dict_file = FLAGS_dict_file;
-    std::string lm_path = FLAGS_lm_path;
-    LOG(INFO) << "model path: " << model_path;
-    LOG(INFO) << "model param: " << model_params;
-    LOG(INFO) << "dict path: " << dict_file;
-    LOG(INFO) << "lm path: " << lm_path;
-
-    int32 num_done = 0, num_err = 0;
-
-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-
-    ppspeech::ModelOptions model_opts;
-    model_opts.model_path = model_path;
-    model_opts.param_path = model_params;
-    model_opts.cache_names = FLAGS_model_cache_names;
-    model_opts.cache_shape = FLAGS_model_cache_shapes;
-    model_opts.input_names = FLAGS_model_input_names;
-    model_opts.output_names = FLAGS_model_output_names;
-    std::shared_ptr<ppspeech::PaddleNnet> nnet(
-        new ppspeech::PaddleNnet(model_opts));
-    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
-    std::shared_ptr<ppspeech::Decodable> decodable(
-        new ppspeech::Decodable(nnet, raw_data));
-
-    int32 chunk_size = FLAGS_receptive_field_length;
-    int32 chunk_stride = FLAGS_downsampling_rate;
-    int32 receptive_field_length = FLAGS_receptive_field_length;
-    LOG(INFO) << "chunk size (frame): " << chunk_size;
-    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
-    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
-    decoder.InitDecoder();
-
-    kaldi::Timer timer;
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-        string utt = feature_reader.Key();
-        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
-        raw_data->SetDim(feature.NumCols());
-        LOG(INFO) << "process utt: " << utt;
-        LOG(INFO) << "rows: " << feature.NumRows();
-        LOG(INFO) << "cols: " << feature.NumCols();
-
-        int32 row_idx = 0;
-        int32 padding_len = 0;
-        int32 ori_feature_len = feature.NumRows();
-        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
-            padding_len =
-                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
-            feature.Resize(feature.NumRows() + padding_len,
-                           feature.NumCols(),
-                           kaldi::kCopyData);
-        }
-        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
-        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
-            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
-                                                          feature.NumCols());
-            int32 feature_chunk_size = 0;
-            if (ori_feature_len > chunk_idx * chunk_stride) {
-                feature_chunk_size = std::min(
-                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
-            }
-            if (feature_chunk_size < receptive_field_length) break;
-
-            int32 start = chunk_idx * chunk_stride;
-
-            for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
-                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
-                    feature_chunk.Data() + row_id * feature.NumCols(),
-                    feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
-                ++start;
-            }
-            raw_data->Accept(feature_chunk);
-            if (chunk_idx == num_chunks - 1) {
-                raw_data->SetFinished();
-            }
-            decoder.AdvanceDecode(decodable);
-        }
-        std::string result;
-        result = decoder.GetFinalBestPath();
-        decodable->Reset();
-        decoder.Reset();
-        if (result.empty()) {
-            // the TokenWriter can not write empty string.
-            ++num_err;
-            KALDI_LOG << " the result of " << utt << " is empty";
-            continue;
-        }
-        KALDI_LOG << " the result of " << utt << " is " << result;
-        result_writer.Write(utt, result);
-        ++num_done;
-    }
-
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << " cost:" << elapsed << " s";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/examples/ds2_ol/decoder/local/model.sh b/speechx/examples/ds2_ol/decoder/local/model.sh
deleted file mode 100644
index 5c609a6cf..000000000
--- a/speechx/examples/ds2_ol/decoder/local/model.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash 
-
-
diff --git a/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc b/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc
deleted file mode 100644
index 0e249cc6b..000000000
--- a/speechx/examples/ds2_ol/decoder/nnet-logprob-decoder-test.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "decoder/ctc_beam_search_decoder.h"
-#include "kaldi/util/table-types.h"
-#include "nnet/decodable.h"
-
-DEFINE_string(nnet_prob_respecifier, "", "test nnet prob rspecifier");
-DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
-DEFINE_string(lm_path, "lm.klm", "language model");
-
-using kaldi::BaseFloat;
-using kaldi::Matrix;
-using std::vector;
-
-// test decoder by feeding nnet posterior probability
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
-        FLAGS_nnet_prob_respecifier);
-    std::string dict_file = FLAGS_dict_file;
-    std::string lm_path = FLAGS_lm_path;
-    LOG(INFO) << "dict path: " << dict_file;
-    LOG(INFO) << "lm path: " << lm_path;
-
-    int32 num_done = 0, num_err = 0;
-
-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-
-    std::shared_ptr<ppspeech::Decodable> decodable(
-        new ppspeech::Decodable(nullptr, nullptr));
-
-    decoder.InitDecoder();
-
-    for (; !likelihood_reader.Done(); likelihood_reader.Next()) {
-        string utt = likelihood_reader.Key();
-        const kaldi::Matrix<BaseFloat> likelihood = likelihood_reader.Value();
-        LOG(INFO) << "process utt: " << utt;
-        LOG(INFO) << "rows: " << likelihood.NumRows();
-        LOG(INFO) << "cols: " << likelihood.NumCols();
-        decodable->Acceptlikelihood(likelihood);
-        decoder.AdvanceDecode(decodable);
-        std::string result;
-        result = decoder.GetFinalBestPath();
-        KALDI_LOG << " the result of " << utt << " is " << result;
-        decodable->Reset();
-        decoder.Reset();
-        ++num_done;
-    }
-
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/examples/ds2_ol/decoder/path.sh b/speechx/examples/ds2_ol/decoder/path.sh
deleted file mode 100644
index 8e26e6e7e..000000000
--- a/speechx/examples/ds2_ol/decoder/path.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-
-export LC_AL=C
-
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc b/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
deleted file mode 100644
index 7aef73f74..000000000
--- a/speechx/examples/ds2_ol/decoder/recognizer_test_main.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "decoder/recognizer.h"
-#include "decoder/param.h"
-#include "kaldi/feat/wave-reader.h"
-#include "kaldi/util/table-types.h"
-
-DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
-DEFINE_string(result_wspecifier, "", "test result wspecifier");
-DEFINE_int32(sample_rate, 16000, "sample rate");
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure();
-    ppspeech::Recognizer recognizer(resource);
-
-    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
-        FLAGS_wav_rspecifier);
-    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
-
-    int sample_rate = FLAGS_sample_rate;
-    float streaming_chunk = FLAGS_streaming_chunk;
-    int chunk_sample_size = streaming_chunk * sample_rate;
-    LOG(INFO) << "sr: " << sample_rate;
-    LOG(INFO) << "chunk size (s): " << streaming_chunk;
-    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
-
-    int32 num_done = 0, num_err = 0;
-    double tot_wav_duration = 0.0;
-
-    kaldi::Timer timer;
-
-    for (; !wav_reader.Done(); wav_reader.Next()) {
-        std::string utt = wav_reader.Key();
-        const kaldi::WaveData& wave_data = wav_reader.Value();
-
-        int32 this_channel = 0;
-        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
-                                                    this_channel);
-        int tot_samples = waveform.Dim();
-        tot_wav_duration += tot_samples * 1.0 / sample_rate;
-        LOG(INFO) << "wav len (sample): " << tot_samples;
-
-        int sample_offset = 0;
-        std::vector<kaldi::Vector<BaseFloat>> feats;
-        int feature_rows = 0;
-        while (sample_offset < tot_samples) {
-            int cur_chunk_size =
-                std::min(chunk_sample_size, tot_samples - sample_offset);
-
-            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
-            for (int i = 0; i < cur_chunk_size; ++i) {
-                wav_chunk(i) = waveform(sample_offset + i);
-            }
-            // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size);
-
-            recognizer.Accept(wav_chunk);
-            if (cur_chunk_size < chunk_sample_size) {
-                recognizer.SetFinished();
-            }
-            recognizer.Decode();
-
-            // no overlap
-            sample_offset += cur_chunk_size;
-        }
-
-        std::string result;
-        result = recognizer.GetFinalResult();
-        recognizer.Reset();
-        if (result.empty()) {
-            // the TokenWriter can not write empty string.
-            ++num_err;
-            KALDI_LOG << " the result of " << utt << " is empty";
-            continue;
-        }
-        KALDI_LOG << " the result of " << utt << " is " << result;
-        result_writer.Write(utt, result);
-        ++num_done;
-    }
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << "Done " << num_done << " out of " << (num_err + num_done);
-    KALDI_LOG << " cost:" << elapsed << " s";
-    KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
-    KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
-}
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/decoder/run.sh b/speechx/examples/ds2_ol/decoder/run.sh
deleted file mode 100755
index 40501eb41..000000000
--- a/speechx/examples/ds2_ol/decoder/run.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. path.sh
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
-    pushd ${SPEECHX_ROOT} 
-    bash build.sh
-    popd
-fi
-
-# input
-mkdir -p data
-data=$PWD/data
-ckpt_dir=$data/model
-model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
-vocb_dir=$ckpt_dir/data/lang_char/
-
-lm=$data/zh_giga.no_cna_cmn.prune01244.klm
-
-# output
-exp_dir=./exp
-mkdir -p $exp_dir
-
-# 2. download model
-if [[ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]]; then
-    mkdir -p data/model
-    pushd data/model
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    popd
-fi
-
-# produce wav scp
-if [ ! -f data/wav.scp ]; then
-    pushd data
-    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
-    echo "utt1 " $PWD/zh.wav > wav.scp
-    popd 
-fi
-
-# download lm
-if [ ! -f $lm ]; then
-    pushd data
-    wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
-    popd
-fi
-
-feat_wspecifier=$exp_dir/feats.ark
-cmvn=$exp_dir/cmvn.ark
-
-export GLOG_logtostderr=1
-
-# dump json cmvn to kaldi
-cmvn-json2kaldi \
-    --json_file  $ckpt_dir/data/mean_std.json \
-    --cmvn_write_path $cmvn \
-    --binary=false
-echo "convert json cmvn to kaldi ark."
-
-
-# generate linear feature as streaming
-linear-spectrogram-wo-db-norm-ol \
-    --wav_rspecifier=scp:$data/wav.scp \
-    --feature_wspecifier=ark,t:$feat_wspecifier \
-    --cmvn_file=$cmvn
-echo "compute linear spectrogram feature."
-
-# run ctc beam search decoder as streaming
-ctc-prefix-beam-search-decoder-ol \
-    --result_wspecifier=ark,t:$exp_dir/result.txt \
-    --feature_rspecifier=ark:$feat_wspecifier \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --param_path=$model_dir/avg_1.jit.pdiparams \
-    --dict_file=$vocb_dir/vocab.txt \
-    --lm_path=$lm
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/decoder/valgrind.sh b/speechx/examples/ds2_ol/decoder/valgrind.sh
deleted file mode 100755
index 14efe0ba4..000000000
--- a/speechx/examples/ds2_ol/decoder/valgrind.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-# this script is for memory check, so please run ./run.sh first.
-
-set +x
-set -e
-
-. ./path.sh
-
-if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
-  echo "please install valgrind in the speechx tools dir.\n" 
-  exit 1
-fi
-
-model_dir=../paddle_asr_model
-feat_wspecifier=./feats.ark
-cmvn=./cmvn.ark
-
-valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
-  offline_decoder_main \
-  --feature_respecifier=ark:$feat_wspecifier \
-  --model_path=$model_dir/avg_1.jit.pdmodel \
-  --param_path=$model_dir/avg_1.jit.pdparams \
-  --dict_file=$model_dir/vocab.txt \
-  --lm_path=$model_dir/avg_1.jit.klm
-
diff --git a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc b/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
deleted file mode 100644
index fefc16d2c..000000000
--- a/speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "decoder/ctc_tlg_decoder.h"
-#include "frontend/audio/data_cache.h"
-#include "kaldi/util/table-types.h"
-#include "nnet/decodable.h"
-#include "nnet/paddle_nnet.h"
-
-DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
-DEFINE_string(result_wspecifier, "", "test result wspecifier");
-DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
-DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
-DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
-DEFINE_string(graph_path, "TLG", "decoder graph");
-
-DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
-DEFINE_int32(max_active, 7500, "decoder graph");
-DEFINE_int32(receptive_field_length,
-             7,
-             "receptive field of two CNN(kernel=5) downsampling module.");
-DEFINE_int32(downsampling_rate,
-             4,
-             "two CNN(kernel=5) module downsampling rate.");
-DEFINE_string(
-    model_input_names,
-    "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box",
-    "model input names");
-DEFINE_string(model_output_names,
-              "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
-              "model output names");
-DEFINE_string(model_cache_names,
-              "chunk_state_h_box,chunk_state_c_box",
-              "model cache names");
-DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
-
-using kaldi::BaseFloat;
-using kaldi::Matrix;
-using std::vector;
-
-// test TLG decoder by feeding speech feature.
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    kaldi::SequentialBaseFloatMatrixReader feature_reader(
-        FLAGS_feature_rspecifier);
-    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
-    std::string model_graph = FLAGS_model_path;
-    std::string model_params = FLAGS_param_path;
-    std::string word_symbol_table = FLAGS_word_symbol_table;
-    std::string graph_path = FLAGS_graph_path;
-    LOG(INFO) << "model path: " << model_graph;
-    LOG(INFO) << "model param: " << model_params;
-    LOG(INFO) << "word symbol path: " << word_symbol_table;
-    LOG(INFO) << "graph path: " << graph_path;
-
-    int32 num_done = 0, num_err = 0;
-
-    ppspeech::TLGDecoderOptions opts;
-    opts.word_symbol_table = word_symbol_table;
-    opts.fst_path = graph_path;
-    opts.opts.max_active = FLAGS_max_active;
-    opts.opts.beam = 15.0;
-    opts.opts.lattice_beam = 7.5;
-    ppspeech::TLGDecoder decoder(opts);
-
-    ppspeech::ModelOptions model_opts;
-    model_opts.model_path = model_graph;
-    model_opts.param_path = model_params;
-    model_opts.cache_names = FLAGS_model_cache_names;
-    model_opts.cache_shape = FLAGS_model_cache_shapes;
-    model_opts.input_names = FLAGS_model_input_names;
-    model_opts.output_names = FLAGS_model_output_names;
-    std::shared_ptr<ppspeech::PaddleNnet> nnet(
-        new ppspeech::PaddleNnet(model_opts));
-    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
-    std::shared_ptr<ppspeech::Decodable> decodable(
-        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
-
-    int32 chunk_size = FLAGS_receptive_field_length;
-    int32 chunk_stride = FLAGS_downsampling_rate;
-    int32 receptive_field_length = FLAGS_receptive_field_length;
-    LOG(INFO) << "chunk size (frame): " << chunk_size;
-    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
-    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
-    decoder.InitDecoder();
-    kaldi::Timer timer;
-    for (; !feature_reader.Done(); feature_reader.Next()) {
-        string utt = feature_reader.Key();
-        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
-        raw_data->SetDim(feature.NumCols());
-        LOG(INFO) << "process utt: " << utt;
-        LOG(INFO) << "rows: " << feature.NumRows();
-        LOG(INFO) << "cols: " << feature.NumCols();
-
-        int32 row_idx = 0;
-        int32 padding_len = 0;
-        int32 ori_feature_len = feature.NumRows();
-        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
-            padding_len =
-                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
-            feature.Resize(feature.NumRows() + padding_len,
-                           feature.NumCols(),
-                           kaldi::kCopyData);
-        }
-        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
-        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
-            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
-                                                          feature.NumCols());
-            int32 feature_chunk_size = 0;
-            if (ori_feature_len > chunk_idx * chunk_stride) {
-                feature_chunk_size = std::min(
-                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
-            }
-            if (feature_chunk_size < receptive_field_length) break;
-
-            int32 start = chunk_idx * chunk_stride;
-            for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
-                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
-                    feature_chunk.Data() + row_id * feature.NumCols(),
-                    feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
-                ++start;
-            }
-            raw_data->Accept(feature_chunk);
-            if (chunk_idx == num_chunks - 1) {
-                raw_data->SetFinished();
-            }
-            decoder.AdvanceDecode(decodable);
-        }
-        std::string result;
-        result = decoder.GetFinalBestPath();
-        decodable->Reset();
-        decoder.Reset();
-        if (result.empty()) {
-            // the TokenWriter can not write empty string.
-            ++num_err;
-            KALDI_LOG << " the result of " << utt << " is empty";
-            continue;
-        }
-        KALDI_LOG << " the result of " << utt << " is " << result;
-        result_writer.Write(utt, result);
-        ++num_done;
-    }
-
-    double elapsed = timer.Elapsed();
-    KALDI_LOG << " cost:" << elapsed << " s";
-
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/examples/ds2_ol/feat/.gitignore b/speechx/examples/ds2_ol/feat/.gitignore
deleted file mode 100644
index 566f2d97b..000000000
--- a/speechx/examples/ds2_ol/feat/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-exp
-data
diff --git a/speechx/examples/ds2_ol/feat/CMakeLists.txt b/speechx/examples/ds2_ol/feat/CMakeLists.txt
deleted file mode 100644
index 632f22e85..000000000
--- a/speechx/examples/ds2_ol/feat/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-set(bin_name linear-spectrogram-wo-db-norm-ol)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
-
-set(bin_name compute_fbank_main)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
-
-set(bin_name cmvn-json2kaldi)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog)
diff --git a/speechx/examples/ds2_ol/feat/README.md b/speechx/examples/ds2_ol/feat/README.md
deleted file mode 100644
index 89cb79eca..000000000
--- a/speechx/examples/ds2_ol/feat/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# Deepspeech2 Straming Audio Feature
-
-ASR audio feature test bins. We using theses bins to test linaer/fbank/mfcc asr feature as streaming manner.
-
-* linear_spectrogram_without_db_norm_main.cc
-
-compute linear spectrogram w/o db norm in streaming manner.
diff --git a/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc b/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
deleted file mode 100644
index 0def14660..000000000
--- a/speechx/examples/ds2_ol/feat/cmvn-json2kaldi.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Note: Do not print/log ondemand object.
-
-#include "base/common.h"
-#include "base/flags.h"
-#include "base/log.h"
-#include "kaldi/matrix/kaldi-matrix.h"
-#include "kaldi/util/kaldi-io.h"
-#include "utils/file_utils.h"
-// #include "boost/json.hpp"
-#include <boost/json/src.hpp>
-
-DEFINE_string(json_file, "", "cmvn json file");
-DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
-DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
-
-using namespace boost::json;  // from <boost/json.hpp>
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
-
-    auto ifs = std::ifstream(FLAGS_json_file);
-    std::string json_str = ppspeech::ReadFile2String(FLAGS_json_file);
-    auto value = boost::json::parse(json_str);
-    if (!value.is_object()) {
-        LOG(ERROR) << "Input json file format error.";
-    }
-
-    for (auto obj : value.as_object()) {
-        if (obj.key() == "mean_stat") {
-            LOG(INFO) << "mean_stat:" << obj.value();
-        }
-        if (obj.key() == "var_stat") {
-            LOG(INFO) << "var_stat: " << obj.value();
-        }
-        if (obj.key() == "frame_num") {
-            LOG(INFO) << "frame_num: " << obj.value();
-        }
-    }
-
-    boost::json::array mean_stat = value.at("mean_stat").as_array();
-    std::vector<kaldi::BaseFloat> mean_stat_vec;
-    for (auto it = mean_stat.begin(); it != mean_stat.end(); it++) {
-        mean_stat_vec.push_back(it->as_double());
-    }
-
-    boost::json::array var_stat = value.at("var_stat").as_array();
-    std::vector<kaldi::BaseFloat> var_stat_vec;
-    for (auto it = var_stat.begin(); it != var_stat.end(); it++) {
-        var_stat_vec.push_back(it->as_double());
-    }
-
-    kaldi::int32 frame_num = uint64_t(value.at("frame_num").as_int64());
-    LOG(INFO) << "nframe: " << frame_num;
-
-    size_t mean_size = mean_stat_vec.size();
-    kaldi::Matrix<double> cmvn_stats(2, mean_size + 1);
-    for (size_t idx = 0; idx < mean_size; ++idx) {
-        cmvn_stats(0, idx) = mean_stat_vec[idx];
-        cmvn_stats(1, idx) = var_stat_vec[idx];
-    }
-    cmvn_stats(0, mean_size) = frame_num;
-    LOG(INFO) << cmvn_stats;
-
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
-    LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path;
-    LOG(INFO) << "Binary: " << FLAGS_binary;
-    return 0;
-}
diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
deleted file mode 100644
index 67683eebf..000000000
--- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
+++ /dev/null
@@ -1,143 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "kaldi/feat/wave-reader.h"
-#include "kaldi/util/kaldi-io.h"
-#include "kaldi/util/table-types.h"
-
-#include "frontend/audio/audio_cache.h"
-#include "frontend/audio/data_cache.h"
-#include "frontend/audio/fbank.h"
-#include "frontend/audio/feature_cache.h"
-#include "frontend/audio/frontend_itf.h"
-#include "frontend/audio/normalizer.h"
-
-DEFINE_string(wav_rspecifier, "", "test wav scp path");
-DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
-DEFINE_string(cmvn_file, "", "read cmvn");
-DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
-DEFINE_int32(num_bins, 161, "fbank num bins");
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
-        FLAGS_wav_rspecifier);
-    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
-
-    int32 num_done = 0, num_err = 0;
-
-    // feature pipeline: wave cache --> povey window
-    // -->fbank --> global cmvn -> feat cache
-
-    std::unique_ptr<ppspeech::FrontendInterface> data_source(
-        new ppspeech::AudioCache(3600 * 1600, false));
-
-    ppspeech::FbankOptions opt;
-    opt.fbank_opts.frame_opts.frame_length_ms = 25;
-    opt.fbank_opts.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
-    opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
-    opt.fbank_opts.frame_opts.dither = 0.0;
-
-    std::unique_ptr<ppspeech::FrontendInterface> fbank(
-        new ppspeech::Fbank(opt, std::move(data_source)));
-
-    std::unique_ptr<ppspeech::FrontendInterface> cmvn(
-        new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank)));
-
-    ppspeech::FeatureCacheOptions feat_cache_opts;
-    // the feature cache output feature chunk by chunk.
-    // frame_chunk_size : num frame of a chunk.
-    // frame_chunk_stride: chunk sliding window stride.
-    feat_cache_opts.frame_chunk_stride = 1;
-    feat_cache_opts.frame_chunk_size = 1;
-    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
-    LOG(INFO) << "fbank: " << true;
-    LOG(INFO) << "feat dim: " << feature_cache.Dim();
-
-    int sample_rate = 16000;
-    float streaming_chunk = FLAGS_streaming_chunk;
-    int chunk_sample_size = streaming_chunk * sample_rate;
-    LOG(INFO) << "sr: " << sample_rate;
-    LOG(INFO) << "chunk size (s): " << streaming_chunk;
-    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
-
-    for (; !wav_reader.Done(); wav_reader.Next()) {
-        std::string utt = wav_reader.Key();
-        const kaldi::WaveData& wave_data = wav_reader.Value();
-        LOG(INFO) << "process utt: " << utt;
-
-        int32 this_channel = 0;
-        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
-                                                    this_channel);
-        int tot_samples = waveform.Dim();
-        LOG(INFO) << "wav len (sample): " << tot_samples;
-
-        int sample_offset = 0;
-        std::vector<kaldi::Vector<BaseFloat>> feats;
-        int feature_rows = 0;
-        while (sample_offset < tot_samples) {
-            int cur_chunk_size =
-                std::min(chunk_sample_size, tot_samples - sample_offset);
-
-            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
-            for (int i = 0; i < cur_chunk_size; ++i) {
-                wav_chunk(i) = waveform(sample_offset + i);
-            }
-
-            kaldi::Vector<BaseFloat> features;
-            feature_cache.Accept(wav_chunk);
-            if (cur_chunk_size < chunk_sample_size) {
-                feature_cache.SetFinished();
-            }
-            bool flag = true;
-            do {
-                flag = feature_cache.Read(&features);
-                feats.push_back(features);
-                feature_rows += features.Dim() / feature_cache.Dim();
-            } while (flag == true && features.Dim() != 0);
-            sample_offset += cur_chunk_size;
-        }
-
-        int cur_idx = 0;
-        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
-                                                 feature_cache.Dim());
-        for (auto feat : feats) {
-            int num_rows = feat.Dim() / feature_cache.Dim();
-            for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-                for (size_t col_idx = 0; col_idx < feature_cache.Dim();
-                     ++col_idx) {
-                    features(cur_idx, col_idx) =
-                        feat(row_idx * feature_cache.Dim() + col_idx);
-                }
-                ++cur_idx;
-            }
-        }
-        feat_writer.Write(utt, features);
-        feature_cache.Reset();
-
-        if (num_done % 50 == 0 && num_done != 0)
-            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
-        num_done++;
-    }
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc b/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
deleted file mode 100644
index bbf0e6908..000000000
--- a/speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// todo refactor, repalce with gtest
-
-#include "base/flags.h"
-#include "base/log.h"
-#include "kaldi/feat/wave-reader.h"
-#include "kaldi/util/kaldi-io.h"
-#include "kaldi/util/table-types.h"
-
-#include "frontend/audio/audio_cache.h"
-#include "frontend/audio/data_cache.h"
-#include "frontend/audio/feature_cache.h"
-#include "frontend/audio/frontend_itf.h"
-#include "frontend/audio/linear_spectrogram.h"
-#include "frontend/audio/normalizer.h"
-
-DEFINE_string(wav_rspecifier, "", "test wav scp path");
-DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
-DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn");
-DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
-        FLAGS_wav_rspecifier);
-    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
-
-    int32 num_done = 0, num_err = 0;
-
-    // feature pipeline: wave cache --> hanning window
-    // -->linear_spectrogram --> global cmvn -> feat cache
-
-    std::unique_ptr<ppspeech::FrontendInterface> data_source(
-        new ppspeech::AudioCache(3600 * 1600, true));
-
-    ppspeech::LinearSpectrogramOptions opt;
-    opt.frame_opts.frame_length_ms = 20;
-    opt.frame_opts.frame_shift_ms = 10;
-    opt.streaming_chunk = FLAGS_streaming_chunk;
-    opt.frame_opts.dither = 0.0;
-    opt.frame_opts.remove_dc_offset = false;
-    opt.frame_opts.window_type = "hanning";
-    opt.frame_opts.preemph_coeff = 0.0;
-    LOG(INFO) << "linear feature: " << true;
-    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
-    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
-
-    std::unique_ptr<ppspeech::FrontendInterface> linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opt, std::move(data_source)));
-
-    std::unique_ptr<ppspeech::FrontendInterface> cmvn(
-        new ppspeech::CMVN(FLAGS_cmvn_file, std::move(linear_spectrogram)));
-
-    ppspeech::FeatureCacheOptions feat_cache_opts;
-    // the feature cache output feature chunk by chunk.
-    // frame_chunk_size : num frame of a chunk.
-    // frame_chunk_stride: chunk sliding window stride.
-    feat_cache_opts.frame_chunk_stride = 1;
-    feat_cache_opts.frame_chunk_size = 1;
-    ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
-    LOG(INFO) << "feat dim: " << feature_cache.Dim();
-
-    int sample_rate = 16000;
-    float streaming_chunk = FLAGS_streaming_chunk;
-    int chunk_sample_size = streaming_chunk * sample_rate;
-    LOG(INFO) << "sample rate: " << sample_rate;
-    LOG(INFO) << "chunk size (s): " << streaming_chunk;
-    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
-
-
-    for (; !wav_reader.Done(); wav_reader.Next()) {
-        std::string utt = wav_reader.Key();
-        const kaldi::WaveData& wave_data = wav_reader.Value();
-        LOG(INFO) << "process utt: " << utt;
-
-        int32 this_channel = 0;
-        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
-                                                    this_channel);
-        int tot_samples = waveform.Dim();
-        LOG(INFO) << "wav len (sample): " << tot_samples;
-
-        int sample_offset = 0;
-        std::vector<kaldi::Vector<BaseFloat>> feats;
-        int feature_rows = 0;
-        while (sample_offset < tot_samples) {
-            int cur_chunk_size =
-                std::min(chunk_sample_size, tot_samples - sample_offset);
-
-            kaldi::Vector<kaldi::BaseFloat> wav_chunk(cur_chunk_size);
-            for (int i = 0; i < cur_chunk_size; ++i) {
-                wav_chunk(i) = waveform(sample_offset + i);
-            }
-
-            kaldi::Vector<BaseFloat> features;
-            feature_cache.Accept(wav_chunk);
-            if (cur_chunk_size < chunk_sample_size) {
-                feature_cache.SetFinished();
-            }
-            bool flag = true;
-            do {
-                flag = feature_cache.Read(&features);
-                feats.push_back(features);
-                feature_rows += features.Dim() / feature_cache.Dim();
-            } while (flag == true && features.Dim() != 0);
-            sample_offset += cur_chunk_size;
-        }
-
-        int cur_idx = 0;
-        kaldi::Matrix<kaldi::BaseFloat> features(feature_rows,
-                                                 feature_cache.Dim());
-        for (auto feat : feats) {
-            int num_rows = feat.Dim() / feature_cache.Dim();
-            for (int row_idx = 0; row_idx < num_rows; ++row_idx) {
-                for (size_t col_idx = 0; col_idx < feature_cache.Dim();
-                     ++col_idx) {
-                    features(cur_idx, col_idx) =
-                        feat(row_idx * feature_cache.Dim() + col_idx);
-                }
-                ++cur_idx;
-            }
-        }
-        feat_writer.Write(utt, features);
-        feature_cache.Reset();
-
-        if (num_done % 50 == 0 && num_done != 0)
-            KALDI_VLOG(2) << "Processed " << num_done << " utterances";
-        num_done++;
-    }
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
-    return (num_done != 0 ? 0 : 1);
-}
diff --git a/speechx/examples/ds2_ol/feat/path.sh b/speechx/examples/ds2_ol/feat/path.sh
deleted file mode 100644
index ad2b6a4e9..000000000
--- a/speechx/examples/ds2_ol/feat/path.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-
-export LC_AL=C
-
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/feat
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/feat/run.sh b/speechx/examples/ds2_ol/feat/run.sh
deleted file mode 100755
index 757779275..000000000
--- a/speechx/examples/ds2_ol/feat/run.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. ./path.sh
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
-    pushd ${SPEECHX_ROOT} 
-    bash build.sh
-    popd
-fi
-
-# 2. download model
-if [ ! -e data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then
-    mkdir -p data/model
-    pushd data/model
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    popd
-fi
-
-# produce wav scp
-if [ ! -f data/wav.scp ]; then
-    mkdir -p data
-    pushd data
-    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
-    echo "utt1 " $PWD/zh.wav > wav.scp
-    popd 
-fi
-
-
-# input
-data_dir=./data
-exp_dir=./exp
-model_dir=$data_dir/model/
-
-mkdir -p $exp_dir
-
-
-# 3. run feat
-export GLOG_logtostderr=1
-
-cmvn-json2kaldi \
-    --json_file  $model_dir/data/mean_std.json \
-    --cmvn_write_path $exp_dir/cmvn.ark \
-    --binary=false
-echo "convert json cmvn to kaldi ark."
-
-
-linear-spectrogram-wo-db-norm-ol \
-    --wav_rspecifier=scp:$data_dir/wav.scp \
-    --feature_wspecifier=ark,t:$exp_dir/feats.ark \
-    --cmvn_file=$exp_dir/cmvn.ark
-echo "compute linear spectrogram feature."
-
-
diff --git a/speechx/examples/ds2_ol/feat/valgrind.sh b/speechx/examples/ds2_ol/feat/valgrind.sh
deleted file mode 100755
index f8aab63f8..000000000
--- a/speechx/examples/ds2_ol/feat/valgrind.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-# this script is for memory check, so please run ./run.sh first.
-
-set +x
-set -e
-
-. ./path.sh
-
-if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
-  echo "please install valgrind in the speechx tools dir.\n" 
-  exit 1
-fi
-
-model_dir=../paddle_asr_model
-feat_wspecifier=./feats.ark
-cmvn=./cmvn.ark
-
-valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
-  linear_spectrogram_main \
-  --wav_rspecifier=scp:$model_dir/wav.scp \
-  --feature_wspecifier=ark,t:$feat_wspecifier \
-  --cmvn_write_path=$cmvn
-
diff --git a/speechx/examples/ds2_ol/nnet/.gitignore b/speechx/examples/ds2_ol/nnet/.gitignore
deleted file mode 100644
index bbd86a25b..000000000
--- a/speechx/examples/ds2_ol/nnet/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-data
-exp
diff --git a/speechx/examples/ds2_ol/nnet/CMakeLists.txt b/speechx/examples/ds2_ol/nnet/CMakeLists.txt
deleted file mode 100644
index 6745a51ae..000000000
--- a/speechx/examples/ds2_ol/nnet/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-set(bin_name ds2-model-ol-test)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC nnet gflags glog ${DEPS})
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/nnet/README.md b/speechx/examples/ds2_ol/nnet/README.md
deleted file mode 100644
index 772a58f0e..000000000
--- a/speechx/examples/ds2_ol/nnet/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Deepspeech2 Streaming NNet Test
-
-Using for ds2 streaming nnet inference test.
diff --git a/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc b/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc
deleted file mode 100644
index 283466dc1..000000000
--- a/speechx/examples/ds2_ol/nnet/ds2-model-ol-test.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// deepspeech2 online model info
-
-#include <algorithm>
-#include <fstream>
-#include <functional>
-#include <iostream>
-#include <iterator>
-#include <numeric>
-#include <thread>
-#include "base/flags.h"
-#include "base/log.h"
-#include "paddle_inference_api.h"
-
-using std::cout;
-using std::endl;
-
-
-DEFINE_string(model_path, "", "xxx.pdmodel");
-DEFINE_string(param_path, "", "xxx.pdiparams");
-DEFINE_int32(chunk_size, 35, "feature chunk size, unit:frame");
-DEFINE_int32(feat_dim, 161, "feature dim");
-
-
-void produce_data(std::vector<std::vector<float>>* data);
-void model_forward_test();
-
-void produce_data(std::vector<std::vector<float>>* data) {
-    int chunk_size = FLAGS_chunk_size;  // chunk_size in frame
-    int col_size = FLAGS_feat_dim;      // feat dim
-    cout << "chunk size: " << chunk_size << endl;
-    cout << "feat dim: " << col_size << endl;
-
-    data->reserve(chunk_size);
-    data->back().reserve(col_size);
-    for (int row = 0; row < chunk_size; ++row) {
-        data->push_back(std::vector<float>());
-        for (int col_idx = 0; col_idx < col_size; ++col_idx) {
-            data->back().push_back(0.201);
-        }
-    }
-}
-
-void model_forward_test() {
-    std::cout << "1. read the data" << std::endl;
-    std::vector<std::vector<float>> feats;
-    produce_data(&feats);
-
-    std::cout << "2. load the model" << std::endl;
-    ;
-    std::string model_graph = FLAGS_model_path;
-    std::string model_params = FLAGS_param_path;
-    CHECK(model_graph != "");
-    CHECK(model_params != "");
-    cout << "model path: " << model_graph << endl;
-    cout << "model param path : " << model_params << endl;
-
-    paddle_infer::Config config;
-    config.SetModel(model_graph, model_params);
-    config.SwitchIrOptim(false);
-    cout << "SwitchIrOptim: " << false << endl;
-    config.DisableFCPadding();
-    cout << "DisableFCPadding: " << endl;
-    auto predictor = paddle_infer::CreatePredictor(config);
-
-    std::cout << "3. feat shape, row=" << feats.size()
-              << ",col=" << feats[0].size() << std::endl;
-    std::vector<float> pp_input_mat;
-    for (const auto& item : feats) {
-        pp_input_mat.insert(pp_input_mat.end(), item.begin(), item.end());
-    }
-
-    std::cout << "4. fead the data to model" << std::endl;
-    int row = feats.size();
-    int col = feats[0].size();
-    std::vector<std::string> input_names = predictor->GetInputNames();
-    std::vector<std::string> output_names = predictor->GetOutputNames();
-    for (auto name : input_names) {
-        cout << "model input names: " << name << endl;
-    }
-    for (auto name : output_names) {
-        cout << "model output names: " << name << endl;
-    }
-
-    // input
-    std::unique_ptr<paddle_infer::Tensor> input_tensor =
-        predictor->GetInputHandle(input_names[0]);
-    std::vector<int> INPUT_SHAPE = {1, row, col};
-    input_tensor->Reshape(INPUT_SHAPE);
-    input_tensor->CopyFromCpu(pp_input_mat.data());
-
-    // input length
-    std::unique_ptr<paddle_infer::Tensor> input_len =
-        predictor->GetInputHandle(input_names[1]);
-    std::vector<int> input_len_size = {1};
-    input_len->Reshape(input_len_size);
-    std::vector<int64_t> audio_len;
-    audio_len.push_back(row);
-    input_len->CopyFromCpu(audio_len.data());
-
-    // state_h
-    std::unique_ptr<paddle_infer::Tensor> chunk_state_h_box =
-        predictor->GetInputHandle(input_names[2]);
-    std::vector<int> chunk_state_h_box_shape = {5, 1, 1024};
-    chunk_state_h_box->Reshape(chunk_state_h_box_shape);
-    int chunk_state_h_box_size =
-        std::accumulate(chunk_state_h_box_shape.begin(),
-                        chunk_state_h_box_shape.end(),
-                        1,
-                        std::multiplies<int>());
-    std::vector<float> chunk_state_h_box_data(chunk_state_h_box_size, 0.0f);
-    chunk_state_h_box->CopyFromCpu(chunk_state_h_box_data.data());
-
-    // state_c
-    std::unique_ptr<paddle_infer::Tensor> chunk_state_c_box =
-        predictor->GetInputHandle(input_names[3]);
-    std::vector<int> chunk_state_c_box_shape = {5, 1, 1024};
-    chunk_state_c_box->Reshape(chunk_state_c_box_shape);
-    int chunk_state_c_box_size =
-        std::accumulate(chunk_state_c_box_shape.begin(),
-                        chunk_state_c_box_shape.end(),
-                        1,
-                        std::multiplies<int>());
-    std::vector<float> chunk_state_c_box_data(chunk_state_c_box_size, 0.0f);
-    chunk_state_c_box->CopyFromCpu(chunk_state_c_box_data.data());
-
-    // run
-    bool success = predictor->Run();
-
-    // state_h out
-    std::unique_ptr<paddle_infer::Tensor> h_out =
-        predictor->GetOutputHandle(output_names[2]);
-    std::vector<int> h_out_shape = h_out->shape();
-    int h_out_size = std::accumulate(
-        h_out_shape.begin(), h_out_shape.end(), 1, std::multiplies<int>());
-    std::vector<float> h_out_data(h_out_size);
-    h_out->CopyToCpu(h_out_data.data());
-
-    // stage_c out
-    std::unique_ptr<paddle_infer::Tensor> c_out =
-        predictor->GetOutputHandle(output_names[3]);
-    std::vector<int> c_out_shape = c_out->shape();
-    int c_out_size = std::accumulate(
-        c_out_shape.begin(), c_out_shape.end(), 1, std::multiplies<int>());
-    std::vector<float> c_out_data(c_out_size);
-    c_out->CopyToCpu(c_out_data.data());
-
-    // output tensor
-    std::unique_ptr<paddle_infer::Tensor> output_tensor =
-        predictor->GetOutputHandle(output_names[0]);
-    std::vector<int> output_shape = output_tensor->shape();
-    std::vector<float> output_probs;
-    int output_size = std::accumulate(
-        output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
-    output_probs.resize(output_size);
-    output_tensor->CopyToCpu(output_probs.data());
-    row = output_shape[1];
-    col = output_shape[2];
-
-    // probs
-    std::vector<std::vector<float>> probs;
-    probs.reserve(row);
-    for (int i = 0; i < row; i++) {
-        probs.push_back(std::vector<float>());
-        probs.back().reserve(col);
-
-        for (int j = 0; j < col; j++) {
-            probs.back().push_back(output_probs[i * col + j]);
-        }
-    }
-
-    std::vector<std::vector<float>> log_feat = probs;
-    std::cout << "probs, row: " << log_feat.size()
-              << " col: " << log_feat[0].size() << std::endl;
-    for (size_t row_idx = 0; row_idx < log_feat.size(); ++row_idx) {
-        for (size_t col_idx = 0; col_idx < log_feat[row_idx].size();
-             ++col_idx) {
-            std::cout << log_feat[row_idx][col_idx] << " ";
-        }
-        std::cout << std::endl;
-    }
-}
-
-int main(int argc, char* argv[]) {
-    gflags::ParseCommandLineFlags(&argc, &argv, false);
-    google::InitGoogleLogging(argv[0]);
-
-    model_forward_test();
-    return 0;
-}
diff --git a/speechx/examples/ds2_ol/nnet/path.sh b/speechx/examples/ds2_ol/nnet/path.sh
deleted file mode 100644
index 0ee8b4787..000000000
--- a/speechx/examples/ds2_ol/nnet/path.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# This contains the locations of binarys build required for running the examples.
-
-SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
-
-export LC_AL=C
-
-SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/nnet
-export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/ds2_ol/nnet/run.sh b/speechx/examples/ds2_ol/nnet/run.sh
deleted file mode 100755
index 10029f7e8..000000000
--- a/speechx/examples/ds2_ol/nnet/run.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-set +x
-set -e
-
-. path.sh
-
-# 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
-    pushd ${SPEECHX_ROOT} 
-    bash build.sh
-    popd
-fi
-
-# 2. download model
-if [ ! -f data/model/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz ]; then
-    mkdir -p data/model
-    pushd data/model
-    wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    tar xzfv asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
-    popd
-fi
-
-# produce wav scp
-if [ ! -f data/wav.scp ]; then
-    mkdir -p data
-    pushd data
-    wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
-    echo "utt1 " $PWD/zh.wav > wav.scp
-    popd 
-fi
-
-ckpt_dir=./data/model
-model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
-
-ds2-model-ol-test \
-    --model_path=$model_dir/avg_1.jit.pdmodel \
-    --param_path=$model_dir/avg_1.jit.pdiparams
-
diff --git a/speechx/examples/ds2_ol/nnet/valgrind.sh b/speechx/examples/ds2_ol/nnet/valgrind.sh
deleted file mode 100755
index 2a08c6082..000000000
--- a/speechx/examples/ds2_ol/nnet/valgrind.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-# this script is for memory check, so please run ./run.sh first.
-
-set +x
-set -e
-
-. ./path.sh
-
-if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
-  echo "please install valgrind in the speechx tools dir.\n" 
-  exit 1
-fi
-
-model_dir=../paddle_asr_model
-
-valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
-  pp-model-test \
-  --model_path=$model_dir/avg_1.jit.pdmodel \
-  --param_path=$model_dir/avg_1.jit.pdparams
\ No newline at end of file
diff --git a/utils/README.md b/utils/README.md
index 163be850f..db2064efa 100644
--- a/utils/README.md
+++ b/utils/README.md
@@ -1,4 +1,4 @@
 # Utils
 
 * [kaldi utils](https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/egs/wsj/s5/utils)
-* [espnet utils)(https://github.com/espnet/espnet/tree/master/utils)
+* [espnet utils](https://github.com/espnet/espnet/tree/master/utils)

From 1d01c5b525c29b000148aef21380e08b96c5eb33 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Fri, 27 May 2022 10:11:21 +0800
Subject: [PATCH 20/40] fix frontend/audio/cmakelist

---
 speechx/speechx/frontend/audio/CMakeLists.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index 86faf8ced..0aec68faf 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -11,10 +11,14 @@ add_library(frontend STATIC
 )
 target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
 
-#target_link_libraries(${bin_name} frontend kaldi-util kaldi-feat-common gflags glog)
+
+
+set(bin_name cmvn_json2kaldi_main)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog)
 
 set(BINS 
-  cmvn_json2kaldi_main
   compute_linear_spectrogram_main
   compute_fbank_main
 )

From e23b173c844f9de0895671933211dbc9cea97bb2 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Fri, 27 May 2022 17:18:35 +0800
Subject: [PATCH 21/40] add partial result

---
 .../ds2_ol/websocket/websocket_server.sh      |  2 +-
 speechx/speechx/decoder/ctc_tlg_decoder.cc    | 20 +++++++++++++++++++
 speechx/speechx/decoder/ctc_tlg_decoder.h     |  1 +
 speechx/speechx/decoder/recognizer.cc         |  4 ++++
 speechx/speechx/decoder/recognizer.h          |  1 +
 speechx/speechx/websocket/websocket_client.cc |  3 +++
 speechx/speechx/websocket/websocket_client.h  |  2 ++
 .../websocket/websocket_client_main.cc        |  1 -
 speechx/speechx/websocket/websocket_server.cc |  5 +++--
 speechx/speechx/websocket/websocket_server.h  |  1 -
 10 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/speechx/examples/ds2_ol/websocket/websocket_server.sh b/speechx/examples/ds2_ol/websocket/websocket_server.sh
index fc57e326f..f798dfd41 100755
--- a/speechx/examples/ds2_ol/websocket/websocket_server.sh
+++ b/speechx/examples/ds2_ol/websocket/websocket_server.sh
@@ -45,7 +45,7 @@ export GLOG_logtostderr=1
 
 # 3. gen cmvn 
 cmvn=$data/cmvn.ark
-cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
+cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
 
 
 wfst=$data/wfst/
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc
index 02e643165..3f8bdd5a7 100644
--- a/speechx/speechx/decoder/ctc_tlg_decoder.cc
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc
@@ -47,6 +47,26 @@ void TLGDecoder::Reset() {
     return;
 }
 
+// Returns the best-path text decoded so far; "" if no frame was decoded yet.
+std::string TLGDecoder::GetPartialResult() {
+    if (frame_decoded_size_ == 0) {
+        // GetBestPath asserts NumFramesDecoded() > 0 ("You cannot call
+        // BestPathEnd if no frames were decoded."), so return early.
+        return std::string("");
+    }
+    kaldi::Lattice lat;
+    kaldi::LatticeWeight weight;
+    std::vector<int> alignment;
+    std::vector<int> words_id;
+    decoder_->GetBestPath(&lat, false);
+    fst::GetLinearSymbolSequence(lat, &alignment, &words_id, &weight);
+    std::string words;
+    for (const int word_id : words_id) {
+        words += word_symbol_table_->Find(word_id);
+    }
+    return words;
+}
+
 std::string TLGDecoder::GetFinalBestPath() {
     if (frame_decoded_size_ == 0) {
         // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h
index 361c44af5..1ac46ac64 100644
--- a/speechx/speechx/decoder/ctc_tlg_decoder.h
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.h
@@ -38,6 +38,7 @@ class TLGDecoder {
     std::string GetBestPath();
     std::vector<std::pair<double, std::string>> GetNBestPath();
     std::string GetFinalBestPath();
+    std::string GetPartialResult();
     int NumFrameDecoded();
     int DecodeLikelihoods(const std::vector<std::vector<BaseFloat>>& probs,
                           std::vector<std::string>& nbest_words);
diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/decoder/recognizer.cc
index 2c90ada99..44c3911c9 100644
--- a/speechx/speechx/decoder/recognizer.cc
+++ b/speechx/speechx/decoder/recognizer.cc
@@ -44,6 +44,10 @@ std::string Recognizer::GetFinalResult() {
     return decoder_->GetFinalBestPath();
 }
 
+std::string Recognizer::GetPartialResult() {
+    return decoder_->GetPartialResult();
+}
+
 void Recognizer::SetFinished() {
     feature_pipeline_->SetFinished();
     input_finished_ = true;
diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/decoder/recognizer.h
index 9a7e7d11e..35e1e1676 100644
--- a/speechx/speechx/decoder/recognizer.h
+++ b/speechx/speechx/decoder/recognizer.h
@@ -43,6 +43,7 @@ class Recognizer {
     void Accept(const kaldi::Vector<kaldi::BaseFloat>& waves);
     void Decode();
     std::string GetFinalResult();
+    std::string GetPartialResult();
     void SetFinished();
     bool IsFinished();
     void Reset();
diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/websocket/websocket_client.cc
index 6bd930b85..3a8523057 100644
--- a/speechx/speechx/websocket/websocket_client.cc
+++ b/speechx/speechx/websocket/websocket_client.cc
@@ -67,6 +67,9 @@ void WebSocketClient::ReadLoopFunc() {
             if (obj["type"] == "final_result") {
                 result_ = obj["result"].as_string().c_str();
             }
+            if (obj["type"] == "partial_result") {
+                partial_result_ = obj["partial_result"].as_string().c_str();
+            }
             if (obj["type"] == "speech_end") {
                 done_ = true;
                 break;
diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/websocket/websocket_client.h
index ac0aed310..7d05448e6 100644
--- a/speechx/speechx/websocket/websocket_client.h
+++ b/speechx/speechx/websocket/websocket_client.h
@@ -41,11 +41,13 @@ class WebSocketClient {
     void SendDataEnd();
     bool Done() const { return done_; }
     std::string GetResult() { return result_; }
+    std::string GetPartialResult() { return partial_result_; }
 
   private:
     void Connect();
     std::string host_;
     std::string result_;
+    std::string partial_result_;
     int port_;
     bool done_ = false;
     asio::io_context ioc_;
diff --git a/speechx/speechx/websocket/websocket_client_main.cc b/speechx/speechx/websocket/websocket_client_main.cc
index df658b0a2..7ad36e3a5 100644
--- a/speechx/speechx/websocket/websocket_client_main.cc
+++ b/speechx/speechx/websocket/websocket_client_main.cc
@@ -59,7 +59,6 @@ int main(int argc, char* argv[]) {
             client.SendBinaryData(wav_chunk.data(),
                                   wav_chunk.size() * sizeof(int16));
 
-
             sample_offset += cur_chunk_size;
             LOG(INFO) << "Send " << cur_chunk_size << " samples";
             std::this_thread::sleep_for(
diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/websocket/websocket_server.cc
index 28c9eca4e..569f5378e 100644
--- a/speechx/speechx/websocket/websocket_server.cc
+++ b/speechx/speechx/websocket/websocket_server.cc
@@ -75,9 +75,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
     CHECK(recognizer_ != nullptr);
     recognizer_->Accept(pcm_data);
 
-    // TODO: return lpartial result
+    std::string partial_result = recognizer_->GetPartialResult();
+    
     json::value rv = {
-        {"status", "ok"}, {"type", "partial_result"}, {"result", "TODO"}};
+        {"status", "ok"}, {"type", "partial_result"}, {"partial_result", partial_result}};
     ws_.text(true);
     ws_.write(asio::buffer(json::serialize(rv)));
 }
diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/websocket/websocket_server.h
index 9ea88282e..009fc42ed 100644
--- a/speechx/speechx/websocket/websocket_server.h
+++ b/speechx/speechx/websocket/websocket_server.h
@@ -44,7 +44,6 @@ class ConnectionHandler {
     void OnFinish();
     void OnSpeechData(const beast::flat_buffer& buffer);
     void OnError(const std::string& message);
-    void OnPartialResult(const std::string& result);
     void OnFinalResult(const std::string& result);
     void DecodeThreadFunc();
     std::string SerializeResult(bool finish);

From 977d51473fe8fccbfead4e874dabf19036cd66ed Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Fri, 27 May 2022 17:51:12 +0800
Subject: [PATCH 22/40] fix websocket key, mv websocket into protocol, test=doc

---
 speechx/examples/ds2_ol/websocket/path.sh                     | 2 +-
 speechx/speechx/CMakeLists.txt                                | 4 ++--
 speechx/speechx/protocol/CMakeLists.txt                       | 3 +++
 speechx/speechx/{ => protocol}/websocket/CMakeLists.txt       | 0
 speechx/speechx/{ => protocol}/websocket/websocket_client.cc  | 2 +-
 speechx/speechx/{ => protocol}/websocket/websocket_client.h   | 4 ++--
 .../speechx/{ => protocol}/websocket/websocket_client_main.cc | 0
 speechx/speechx/{ => protocol}/websocket/websocket_server.cc  | 2 +-
 speechx/speechx/{ => protocol}/websocket/websocket_server.h   | 0
 .../speechx/{ => protocol}/websocket/websocket_server_main.cc | 0
 10 files changed, 10 insertions(+), 7 deletions(-)
 rename speechx/speechx/{ => protocol}/websocket/CMakeLists.txt (100%)
 rename speechx/speechx/{ => protocol}/websocket/websocket_client.cc (97%)
 rename speechx/speechx/{ => protocol}/websocket/websocket_client.h (93%)
 rename speechx/speechx/{ => protocol}/websocket/websocket_client_main.cc (100%)
 rename speechx/speechx/{ => protocol}/websocket/websocket_server.cc (98%)
 rename speechx/speechx/{ => protocol}/websocket/websocket_server.h (100%)
 rename speechx/speechx/{ => protocol}/websocket/websocket_server_main.cc (100%)

diff --git a/speechx/examples/ds2_ol/websocket/path.sh b/speechx/examples/ds2_ol/websocket/path.sh
index 3ad032031..d25e88a27 100755
--- a/speechx/examples/ds2_ol/websocket/path.sh
+++ b/speechx/examples/ds2_ol/websocket/path.sh
@@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
 export LC_AL=C
 
-SPEECHX_BIN=$SPEECHX_BUILD/websocket
+SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt
index a9a8a398d..c8e21d486 100644
--- a/speechx/speechx/CMakeLists.txt
+++ b/speechx/speechx/CMakeLists.txt
@@ -34,9 +34,9 @@ add_subdirectory(decoder)
 
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
-${CMAKE_CURRENT_SOURCE_DIR}/websocket
+${CMAKE_CURRENT_SOURCE_DIR}/protocol
 )
-add_subdirectory(websocket)
+add_subdirectory(protocol)
 
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt
index e69de29bb..98b2f38b4 100644
--- a/speechx/speechx/protocol/CMakeLists.txt
+++ b/speechx/speechx/protocol/CMakeLists.txt
@@ -0,0 +1,3 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_subdirectory(websocket)
diff --git a/speechx/speechx/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt
similarity index 100%
rename from speechx/speechx/websocket/CMakeLists.txt
rename to speechx/speechx/protocol/websocket/CMakeLists.txt
diff --git a/speechx/speechx/websocket/websocket_client.cc b/speechx/speechx/protocol/websocket/websocket_client.cc
similarity index 97%
rename from speechx/speechx/websocket/websocket_client.cc
rename to speechx/speechx/protocol/websocket/websocket_client.cc
index 3a8523057..60e06db63 100644
--- a/speechx/speechx/websocket/websocket_client.cc
+++ b/speechx/speechx/protocol/websocket/websocket_client.cc
@@ -68,7 +68,7 @@ void WebSocketClient::ReadLoopFunc() {
                 result_ = obj["result"].as_string().c_str();
             }
             if (obj["type"] == "partial_result") {
-                partial_result_ = obj["partial_result"].as_string().c_str();
+                partial_result_ = obj["result"].as_string().c_str();
             }
             if (obj["type"] == "speech_end") {
                 done_ = true;
diff --git a/speechx/speechx/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h
similarity index 93%
rename from speechx/speechx/websocket/websocket_client.h
rename to speechx/speechx/protocol/websocket/websocket_client.h
index 7d05448e6..8635501a8 100644
--- a/speechx/speechx/websocket/websocket_client.h
+++ b/speechx/speechx/protocol/websocket/websocket_client.h
@@ -40,8 +40,8 @@ class WebSocketClient {
     void SendEndSignal();
     void SendDataEnd();
     bool Done() const { return done_; }
-    std::string GetResult() { return result_; }
-    std::string GetPartialResult() { return partial_result_; }
+    std::string GetResult() const { return result_; } 
+    std::string GetPartialResult() const { return partial_result_;}
 
   private:
     void Connect();
diff --git a/speechx/speechx/websocket/websocket_client_main.cc b/speechx/speechx/protocol/websocket/websocket_client_main.cc
similarity index 100%
rename from speechx/speechx/websocket/websocket_client_main.cc
rename to speechx/speechx/protocol/websocket/websocket_client_main.cc
diff --git a/speechx/speechx/websocket/websocket_server.cc b/speechx/speechx/protocol/websocket/websocket_server.cc
similarity index 98%
rename from speechx/speechx/websocket/websocket_server.cc
rename to speechx/speechx/protocol/websocket/websocket_server.cc
index 569f5378e..a1abd98e6 100644
--- a/speechx/speechx/websocket/websocket_server.cc
+++ b/speechx/speechx/protocol/websocket/websocket_server.cc
@@ -78,7 +78,7 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
     std::string partial_result = recognizer_->GetPartialResult();
     
     json::value rv = {
-        {"status", "ok"}, {"type", "partial_result"}, {"partial_result", partial_result}};
+        {"status", "ok"}, {"type", "partial_result"}, {"result", partial_result}};
     ws_.text(true);
     ws_.write(asio::buffer(json::serialize(rv)));
 }
diff --git a/speechx/speechx/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h
similarity index 100%
rename from speechx/speechx/websocket/websocket_server.h
rename to speechx/speechx/protocol/websocket/websocket_server.h
diff --git a/speechx/speechx/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc
similarity index 100%
rename from speechx/speechx/websocket/websocket_server_main.cc
rename to speechx/speechx/protocol/websocket/websocket_server_main.cc

From 62c50e0060f09c226568107f560613e99356b5d6 Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Fri, 27 May 2022 10:18:45 +0000
Subject: [PATCH 23/40] deprecate the 1.8x model, test=doc

---
 docs/source/released_model.md                 |   8 -
 examples/other/1xt2x/.gitignore               |   1 -
 examples/other/1xt2x/README.md                |  19 -
 examples/other/1xt2x/aishell/.gitignore       |   5 -
 .../1xt2x/aishell/conf/augmentation.json      |   1 -
 .../other/1xt2x/aishell/conf/deepspeech2.yaml |  65 ---
 .../1xt2x/aishell/conf/tuning/decode.yaml     |  10 -
 examples/other/1xt2x/aishell/local/data.sh    |  69 ----
 .../1xt2x/aishell/local/download_lm_ch.sh     |  23 --
 .../1xt2x/aishell/local/download_model.sh     |  25 --
 examples/other/1xt2x/aishell/local/test.sh    |  36 --
 examples/other/1xt2x/aishell/path.sh          |  17 -
 examples/other/1xt2x/aishell/run.sh           |  29 --
 examples/other/1xt2x/baidu_en8k/.gitignore    |   5 -
 .../1xt2x/baidu_en8k/conf/augmentation.json   |   1 -
 .../1xt2x/baidu_en8k/conf/deepspeech2.yaml    |  64 ---
 .../1xt2x/baidu_en8k/conf/tuning/decode.yaml  |  10 -
 examples/other/1xt2x/baidu_en8k/local/data.sh |  85 ----
 .../1xt2x/baidu_en8k/local/download_lm_en.sh  |  22 --
 .../1xt2x/baidu_en8k/local/download_model.sh  |  25 --
 examples/other/1xt2x/baidu_en8k/local/test.sh |  36 --
 examples/other/1xt2x/baidu_en8k/path.sh       |  17 -
 examples/other/1xt2x/baidu_en8k/run.sh        |  29 --
 examples/other/1xt2x/librispeech/.gitignore   |   5 -
 .../1xt2x/librispeech/conf/augmentation.json  |   1 -
 .../1xt2x/librispeech/conf/deepspeech2.yaml   |  64 ---
 .../1xt2x/librispeech/conf/tuning/decode.yaml |  10 -
 .../other/1xt2x/librispeech/local/data.sh     |  83 ----
 .../1xt2x/librispeech/local/download_lm_en.sh |  22 --
 .../1xt2x/librispeech/local/download_model.sh |  25 --
 .../other/1xt2x/librispeech/local/test.sh     |  36 --
 examples/other/1xt2x/librispeech/path.sh      |  16 -
 examples/other/1xt2x/librispeech/run.sh       |  28 --
 .../other/1xt2x/src_deepspeech2x/__init__.py  | 370 ------------------
 .../other/1xt2x/src_deepspeech2x/bin/test.py  |  59 ---
 .../1xt2x/src_deepspeech2x/models/__init__.py |  13 -
 .../src_deepspeech2x/models/ds2/__init__.py   |  17 -
 .../models/ds2/deepspeech2.py                 | 275 -------------
 .../1xt2x/src_deepspeech2x/models/ds2/rnn.py  | 334 ----------------
 .../1xt2x/src_deepspeech2x/test_model.py      | 357 -----------------
 40 files changed, 2317 deletions(-)
 delete mode 100644 examples/other/1xt2x/.gitignore
 delete mode 100644 examples/other/1xt2x/README.md
 delete mode 100644 examples/other/1xt2x/aishell/.gitignore
 delete mode 100644 examples/other/1xt2x/aishell/conf/augmentation.json
 delete mode 100644 examples/other/1xt2x/aishell/conf/deepspeech2.yaml
 delete mode 100644 examples/other/1xt2x/aishell/conf/tuning/decode.yaml
 delete mode 100755 examples/other/1xt2x/aishell/local/data.sh
 delete mode 100755 examples/other/1xt2x/aishell/local/download_lm_ch.sh
 delete mode 100644 examples/other/1xt2x/aishell/local/download_model.sh
 delete mode 100755 examples/other/1xt2x/aishell/local/test.sh
 delete mode 100644 examples/other/1xt2x/aishell/path.sh
 delete mode 100755 examples/other/1xt2x/aishell/run.sh
 delete mode 100644 examples/other/1xt2x/baidu_en8k/.gitignore
 delete mode 100644 examples/other/1xt2x/baidu_en8k/conf/augmentation.json
 delete mode 100644 examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
 delete mode 100644 examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
 delete mode 100755 examples/other/1xt2x/baidu_en8k/local/data.sh
 delete mode 100755 examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
 delete mode 100644 examples/other/1xt2x/baidu_en8k/local/download_model.sh
 delete mode 100755 examples/other/1xt2x/baidu_en8k/local/test.sh
 delete mode 100644 examples/other/1xt2x/baidu_en8k/path.sh
 delete mode 100755 examples/other/1xt2x/baidu_en8k/run.sh
 delete mode 100644 examples/other/1xt2x/librispeech/.gitignore
 delete mode 100644 examples/other/1xt2x/librispeech/conf/augmentation.json
 delete mode 100644 examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
 delete mode 100644 examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
 delete mode 100755 examples/other/1xt2x/librispeech/local/data.sh
 delete mode 100755 examples/other/1xt2x/librispeech/local/download_lm_en.sh
 delete mode 100644 examples/other/1xt2x/librispeech/local/download_model.sh
 delete mode 100755 examples/other/1xt2x/librispeech/local/test.sh
 delete mode 100644 examples/other/1xt2x/librispeech/path.sh
 delete mode 100755 examples/other/1xt2x/librispeech/run.sh
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/__init__.py
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/bin/test.py
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/models/__init__.py
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
 delete mode 100644 examples/other/1xt2x/src_deepspeech2x/test_model.py

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 74435ae1a..19a47e6c1 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -88,11 +88,3 @@ PANN | VoxCeleb| [voxceleb_ecapatdnn](https://github.com/PaddlePaddle/PaddleSpee
 Model Type | Dataset| Example Link | Pretrained Models
 :-------------:| :------------:| :-----: | :-----:
 Ernie Linear | IWLST2012_zh |[iwslt2012_punc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/iwslt2012/punc0)|[ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip)
-
-## Speech Recognition Model  from paddle 1.8
-
-| Acoustic Model |Training Data| Token-based | Size | Descriptions | CER | WER | Hours of speech |
-| :-----:| :-----:  |  :-----:  |  :-----:  | :-----:  |  :-----: | :-----:  | :-----: |
-| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) |        Aishell Dataset  | Char-based  | 234 MB | 2 Conv + 3 bidirectional GRU layers  | 0.0804 | —  | 151 h  |
-| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) |      Librispeech Dataset | Word-based  | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | —  | 0.0685 | 960 h |
-| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based  | 273 MB | 2 Conv + 3 bidirectional GRU layers   |—  | 0.0541 | 8628 h|
diff --git a/examples/other/1xt2x/.gitignore b/examples/other/1xt2x/.gitignore
deleted file mode 100644
index a9a5aecf4..000000000
--- a/examples/other/1xt2x/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-tmp
diff --git a/examples/other/1xt2x/README.md b/examples/other/1xt2x/README.md
deleted file mode 100644
index 49f850d26..000000000
--- a/examples/other/1xt2x/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# 1xt2x
-
-Convert Deepspeech 1.8 released model to 2.x.
-
-## Model source directory
-* Deepspeech2x
-
-## Expriment directory
-* aishell
-* librispeech
-* baidu_en8k
-
-# The released model
-
-Acoustic Model |  Training Data | Hours of Speech | Token-based | CER | WER
-:-------------:| :------------:| :---------------: | :---------: | :---: | :----:
-Ds2 Offline Aishell 1xt2x model| Aishell Dataset | 151 h | Char-based | 0.080447 |
-Ds2 Offline Librispeech 1xt2x model | Librispeech Dataset | 960 h | Word-based | | 0.068548
-Ds2 Offline Baidu en8k 1x2x model | Baidu Internal English Dataset | 8628 h |Word-based | | 0.054112
diff --git a/examples/other/1xt2x/aishell/.gitignore b/examples/other/1xt2x/aishell/.gitignore
deleted file mode 100644
index 3631e544a..000000000
--- a/examples/other/1xt2x/aishell/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-exp
-data
-*log
-tmp
-nohup*
diff --git a/examples/other/1xt2x/aishell/conf/augmentation.json b/examples/other/1xt2x/aishell/conf/augmentation.json
deleted file mode 100644
index fe51488c7..000000000
--- a/examples/other/1xt2x/aishell/conf/augmentation.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
deleted file mode 100644
index c2db2c7c2..000000000
--- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-# https://yaml.org/type/float.html
-###########################################
-#                   Data                  #
-###########################################
-train_manifest: data/manifest.train
-dev_manifest: data/manifest.dev
-test_manifest: data/manifest.test
-min_input_len: 0.0
-max_input_len: 27.0 # second
-min_output_len: 0.0
-max_output_len: .inf
-min_output_input_ratio: 0.00
-max_output_input_ratio: .inf
-
-###########################################
-#              Dataloader                 #
-###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.npz
-unit_type: char
-vocab_filepath: data/vocab.txt 
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix: 
-spectrum_type: linear
-feat_dim: 
-delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
-
-############################################
-#           Network Architecture           #
-############################################
-num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 1024
-use_gru: True 
-share_rnn_weights: False
-blank_id: 4333
-
-###########################################
-#                Training                 #
-###########################################
-n_epoch: 80
-accum_grad: 1
-lr: 2e-3
-lr_decay: 0.83
-weight_decay: 1e-06
-global_grad_clip: 3.0
-log_interval: 100
-checkpoint:
-  kbest_n: 50
-  latest_n: 5
-  
-  
diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
deleted file mode 100644
index b5283a934..000000000
--- a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-decode_batch_size: 32
-error_rate_type: cer 
-decoding_method: ctc_beam_search
-lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-alpha: 2.6
-beta: 5.0
-beam_size: 300
-cutoff_prob: 0.99
-cutoff_top_n: 40
-num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh
deleted file mode 100755
index a9d5b1412..000000000
--- a/examples/other/1xt2x/aishell/local/data.sh
+++ /dev/null
@@ -1,69 +0,0 @@
-#!/bin/bash
-if [ $# != 1 ];then
-    echo "usage: ${0} ckpt_dir"
-    exit -1
-fi
-
-ckpt_dir=$1
-
-stage=-1
-stop_stage=100
-
-source ${MAIN_ROOT}/utils/parse_options.sh
-
-mkdir -p data
-TARGET_DIR=${MAIN_ROOT}/dataset
-mkdir -p ${TARGET_DIR}
-
-bash local/download_model.sh ${ckpt_dir}
-if [ $? -ne 0 ]; then
-   exit 1
-fi
-
-cd ${ckpt_dir}
-tar xzvf aishell_model_v1.8_to_v2.x.tar.gz
-cd -
-mv ${ckpt_dir}/mean_std.npz data/
-mv ${ckpt_dir}/vocab.txt data/
-
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    # download data, generate manifests
-    python3 ${TARGET_DIR}/aishell/aishell.py \
-    --manifest_prefix="data/manifest" \
-    --target_dir="${TARGET_DIR}/aishell"
-
-    if [ $? -ne 0 ]; then
-        echo "Prepare Aishell failed. Terminated."
-        exit 1
-    fi
-
-    for dataset in train dev test; do
-        mv data/manifest.${dataset} data/manifest.${dataset}.raw
-    done
-fi
-
-
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for dataset in train dev test; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_data.py \
-                --cmvn_path "data/mean_std.npz" \
-                --unit_type "char" \
-                --vocab_path="data/vocab.txt" \
-                --manifest_path="data/manifest.${dataset}.raw" \
-                --output_path="data/manifest.${dataset}"
-
-        if [ $? -ne 0 ]; then
-                echo "Formt mnaifest failed. Terminated."
-                exit 1
-        fi
-    } &
-    done
-    wait
-fi
-
-echo "Aishell data preparation done."
-exit 0
diff --git a/examples/other/1xt2x/aishell/local/download_lm_ch.sh b/examples/other/1xt2x/aishell/local/download_lm_ch.sh
deleted file mode 100755
index 47153f4b6..000000000
--- a/examples/other/1xt2x/aishell/local/download_lm_ch.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/lm
-mkdir -p ${DIR}
-
-URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
-MD5="29e02312deb2e59b3c8686c7966d4fe3"
-TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
-
-
-echo "Start downloading the language model. The language model is large, please wait for a moment ..."
-download $URL $MD5 $TARGET > /dev/null 2>&1
-if [ $? -ne 0 ]; then
-    echo "Fail to download the language model!"
-    exit 1
-else
-    echo "Download the language model sucessfully"
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/aishell/local/download_model.sh b/examples/other/1xt2x/aishell/local/download_model.sh
deleted file mode 100644
index ffa2f8101..000000000
--- a/examples/other/1xt2x/aishell/local/download_model.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-
-if [ $# != 1 ];then
-    echo "usage: ${0} ckpt_dir"
-    exit -1
-fi
-
-ckpt_dir=$1
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz'
-MD5=87e7577d4bea737dbf3e8daab37aa808
-TARGET=${ckpt_dir}/aishell_model_v1.8_to_v2.x.tar.gz
-
-
-echo "Download Aishell model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
-    echo "Fail to download Aishell model!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh
deleted file mode 100755
index 463593ef3..000000000
--- a/examples/other/1xt2x/aishell/local/test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
-    exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
-config_path=$1
-decode_config_path=$2
-ckpt_prefix=$3
-model_type=$4
-
-# download language model
-bash local/download_lm_ch.sh
-if [ $? -ne 0 ]; then
-   exit 1
-fi
-
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/aishell/path.sh b/examples/other/1xt2x/aishell/path.sh
deleted file mode 100644
index ce44e65cb..000000000
--- a/examples/other/1xt2x/aishell/path.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-export MAIN_ROOT=`realpath ${PWD}/../../../../`
-export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
-
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
-echo "BIN_DIR "${BIN_DIR}
diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh
deleted file mode 100755
index 89a634119..000000000
--- a/examples/other/1xt2x/aishell/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -e
-source path.sh
-
-stage=0
-stop_stage=100
-conf_path=conf/deepspeech2.yaml
-decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline
-gpus=2
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
-v18_ckpt=aishell_v1.8
-ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # prepare data
-    mkdir -p exp/${ckpt}/checkpoints
-    bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
-fi
-
diff --git a/examples/other/1xt2x/baidu_en8k/.gitignore b/examples/other/1xt2x/baidu_en8k/.gitignore
deleted file mode 100644
index 3631e544a..000000000
--- a/examples/other/1xt2x/baidu_en8k/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-exp
-data
-*log
-tmp
-nohup*
diff --git a/examples/other/1xt2x/baidu_en8k/conf/augmentation.json b/examples/other/1xt2x/baidu_en8k/conf/augmentation.json
deleted file mode 100644
index fe51488c7..000000000
--- a/examples/other/1xt2x/baidu_en8k/conf/augmentation.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
deleted file mode 100644
index 0c08fbc63..000000000
--- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-# https://yaml.org/type/float.html
-###########################################
-#                   Data                  #
-###########################################
-train_manifest: data/manifest.train
-dev_manifest: data/manifest.dev
-test_manifest: data/manifest.test-clean
-min_input_len: 0.0
-max_input_len: .inf # second
-min_output_len: 0.0
-max_output_len: .inf
-min_output_input_ratio: 0.00
-max_output_input_ratio: .inf
-
-###########################################
-#              Dataloader                 #
-###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.npz
-unit_type: char
-vocab_filepath: data/vocab.txt 
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix: 
-spectrum_type: linear
-feat_dim: 
-delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
-
-############################################
-#           Network Architecture           #
-############################################
-num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
-blank_id: 28
-
-###########################################
-#                Training                 #
-###########################################
-n_epoch: 80
-accum_grad: 1
-lr: 2e-3
-lr_decay: 0.83
-weight_decay: 1e-06
-global_grad_clip: 3.0
-log_interval: 100
-checkpoint:
-  kbest_n: 50
-  latest_n: 5
-
diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
deleted file mode 100644
index f52dde320..000000000
--- a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-decode_batch_size: 32
-error_rate_type: wer 
-decoding_method: ctc_beam_search
-lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-alpha: 1.4
-beta: 0.35
-beam_size: 500
-cutoff_prob: 1.0
-cutoff_top_n: 40
-num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh
deleted file mode 100755
index 9b017324d..000000000
--- a/examples/other/1xt2x/baidu_en8k/local/data.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-if [ $# != 1 ];then
-    echo "usage: ${0} ckpt_dir"
-    exit -1
-fi
-
-ckpt_dir=$1
-
-stage=-1
-stop_stage=100
-unit_type=char
-
-source ${MAIN_ROOT}/utils/parse_options.sh
-
-mkdir -p data
-TARGET_DIR=${MAIN_ROOT}/dataset
-mkdir -p ${TARGET_DIR}
-
-
-bash local/download_model.sh ${ckpt_dir}
-if [ $? -ne 0 ]; then
-   exit 1
-fi
-
-cd ${ckpt_dir}
-tar xzvf baidu_en8k_v1.8_to_v2.x.tar.gz
-cd -
-mv ${ckpt_dir}/mean_std.npz data/
-mv ${ckpt_dir}/vocab.txt data/
-
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    # download data, generate manifests
-    python3 ${TARGET_DIR}/librispeech/librispeech.py \
-    --manifest_prefix="data/manifest" \
-    --target_dir="${TARGET_DIR}/librispeech" \
-    --full_download="True"
-
-    if [ $? -ne 0 ]; then
-        echo "Prepare LibriSpeech failed. Terminated."
-        exit 1
-    fi
-
-    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-        mv data/manifest.${set} data/manifest.${set}.raw
-    done
-
-    rm -rf data/manifest.train.raw data/manifest.dev.raw  data/manifest.test.raw
-    for set in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set}.raw >> data/manifest.train.raw
-    done
-
-    for set in dev-clean dev-other; do
-        cat data/manifest.${set}.raw >> data/manifest.dev.raw
-    done
-
-    for set in test-clean test-other; do
-        cat data/manifest.${set}.raw >> data/manifest.test.raw
-    done
-fi
-
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for set in train dev test dev-clean dev-other test-clean test-other; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_data.py \
-        --cmvn_path "data/mean_std.npz" \
-        --unit_type ${unit_type} \
-        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
-
-        if [ $? -ne 0 ]; then
-            echo "Formt mnaifest.${set} failed. Terminated."
-            exit 1
-        fi
-    }&
-    done
-    wait
-fi
-
-echo "LibriSpeech Data preparation done."
-exit 0
-
diff --git a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh b/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
deleted file mode 100755
index 390fffc93..000000000
--- a/examples/other/1xt2x/baidu_en8k/local/download_lm_en.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/lm
-mkdir -p ${DIR}
-
-URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
-
-echo "Start downloading the language model. The language model is large, please wait for a moment ..."
-download $URL $MD5 $TARGET > /dev/null 2>&1
-if [ $? -ne 0 ]; then
-    echo "Fail to download the language model!"
-    exit 1
-else
-    echo "Download the language model sucessfully"
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/baidu_en8k/local/download_model.sh b/examples/other/1xt2x/baidu_en8k/local/download_model.sh
deleted file mode 100644
index a8fbc31e8..000000000
--- a/examples/other/1xt2x/baidu_en8k/local/download_model.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-if [ $# != 1 ];then
-    echo "usage: ${0} ckpt_dir"
-    exit -1
-fi
-
-ckpt_dir=$1
-
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-URL='https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz'
-MD5=c1676be8505cee436e6f312823e9008c
-TARGET=${ckpt_dir}/baidu_en8k_v1.8_to_v2.x.tar.gz
-
-
-echo "Download BaiduEn8k model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
-    echo "Fail to download BaiduEn8k model!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh
deleted file mode 100755
index ea40046b1..000000000
--- a/examples/other/1xt2x/baidu_en8k/local/test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
-    exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
-config_path=$1
-decode_config_path=$2
-ckpt_prefix=$3
-model_type=$4
-
-# download language model
-bash local/download_lm_en.sh
-if [ $? -ne 0 ]; then
-   exit 1
-fi
-
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/baidu_en8k/path.sh b/examples/other/1xt2x/baidu_en8k/path.sh
deleted file mode 100644
index ce44e65cb..000000000
--- a/examples/other/1xt2x/baidu_en8k/path.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-export MAIN_ROOT=`realpath ${PWD}/../../../../`
-export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
-
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
-echo "BIN_DIR "${BIN_DIR}
diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh
deleted file mode 100755
index 82de56b09..000000000
--- a/examples/other/1xt2x/baidu_en8k/run.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-set -e
-source path.sh
-
-stage=0
-stop_stage=100
-conf_path=conf/deepspeech2.yaml
-decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline
-gpus=0
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
-v18_ckpt=baidu_en8k_v1.8
-ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # prepare data
-    mkdir -p exp/${ckpt}/checkpoints
-    bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
-fi
-
diff --git a/examples/other/1xt2x/librispeech/.gitignore b/examples/other/1xt2x/librispeech/.gitignore
deleted file mode 100644
index 3631e544a..000000000
--- a/examples/other/1xt2x/librispeech/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-exp
-data
-*log
-tmp
-nohup*
diff --git a/examples/other/1xt2x/librispeech/conf/augmentation.json b/examples/other/1xt2x/librispeech/conf/augmentation.json
deleted file mode 100644
index fe51488c7..000000000
--- a/examples/other/1xt2x/librispeech/conf/augmentation.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
deleted file mode 100644
index a2a5649ba..000000000
--- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-# https://yaml.org/type/float.html
-###########################################
-#                   Data                  #
-###########################################
-train_manifest: data/manifest.train
-dev_manifest: data/manifest.dev
-test_manifest: data/manifest.test-clean
-min_input_len: 0.0
-max_input_len: 1000.0 # second
-min_output_len: 0.0
-max_output_len: .inf
-min_output_input_ratio: 0.00
-max_output_input_ratio: .inf
-
-###########################################
-#              Dataloader                 #
-###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.npz
-unit_type: char
-vocab_filepath: data/vocab.txt 
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix: 
-spectrum_type: linear
-feat_dim: 
-delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
-
-############################################
-#           Network Architecture           #
-############################################
-num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
-use_gru: False
-share_rnn_weights: True
-blank_id: 28
-
-###########################################
-#                Training                 #
-###########################################
-n_epoch: 80
-accum_grad: 1
-lr: 2e-3
-lr_decay: 0.83
-weight_decay: 1e-06
-global_grad_clip: 3.0
-log_interval: 100
-checkpoint:
-  kbest_n: 50
-  latest_n: 5
-
diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
deleted file mode 100644
index f3b51defe..000000000
--- a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-decode_batch_size: 32
-error_rate_type: wer 
-decoding_method: ctc_beam_search
-lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-alpha: 2.5
-beta: 0.3
-beam_size: 500
-cutoff_prob: 1.0
-cutoff_top_n: 40
-num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh
deleted file mode 100755
index 43b5426d9..000000000
--- a/examples/other/1xt2x/librispeech/local/data.sh
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/bin/bash
-
-if [ $# != 1 ];then
-    echo "usage: ${0} ckpt_dir"
-    exit -1
-fi
-
-ckpt_dir=$1
-
-stage=-1
-stop_stage=100
-unit_type=char
-
-source ${MAIN_ROOT}/utils/parse_options.sh
-
-mkdir -p data
-TARGET_DIR=${MAIN_ROOT}/dataset
-mkdir -p ${TARGET_DIR}
-
-bash local/download_model.sh ${ckpt_dir}
-if [ $? -ne 0 ]; then
-   exit 1
-fi
-
-cd ${ckpt_dir}
-tar xzvf librispeech_v1.8_to_v2.x.tar.gz
-cd -
-mv ${ckpt_dir}/mean_std.npz data/
-mv ${ckpt_dir}/vocab.txt data/
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    # download data, generate manifests
-    python3 ${TARGET_DIR}/librispeech/librispeech.py \
-    --manifest_prefix="data/manifest" \
-    --target_dir="${TARGET_DIR}/librispeech" \
-    --full_download="True"
-
-    if [ $? -ne 0 ]; then
-        echo "Prepare LibriSpeech failed. Terminated."
-        exit 1
-    fi
-
-    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-        mv data/manifest.${set} data/manifest.${set}.raw
-    done
-
-    rm -rf data/manifest.train.raw data/manifest.dev.raw  data/manifest.test.raw
-    for set in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set}.raw >> data/manifest.train.raw
-    done
-
-    for set in dev-clean dev-other; do
-        cat data/manifest.${set}.raw >> data/manifest.dev.raw
-    done
-
-    for set in test-clean test-other; do
-        cat data/manifest.${set}.raw >> data/manifest.test.raw
-    done
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for set in train dev test dev-clean dev-other test-clean test-other; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_data.py \
-        --cmvn_path "data/mean_std.npz" \
-        --unit_type ${unit_type} \
-        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
-
-        if [ $? -ne 0 ]; then
-            echo "Formt mnaifest.${set} failed. Terminated."
-            exit 1
-        fi
-    }&
-    done
-    wait
-fi
-
-echo "LibriSpeech Data preparation done."
-exit 0
-
diff --git a/examples/other/1xt2x/librispeech/local/download_lm_en.sh b/examples/other/1xt2x/librispeech/local/download_lm_en.sh
deleted file mode 100755
index 390fffc93..000000000
--- a/examples/other/1xt2x/librispeech/local/download_lm_en.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/lm
-mkdir -p ${DIR}
-
-URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
-MD5="099a601759d467cd0a8523ff939819c5"
-TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
-
-echo "Start downloading the language model. The language model is large, please wait for a moment ..."
-download $URL $MD5 $TARGET > /dev/null 2>&1
-if [ $? -ne 0 ]; then
-    echo "Fail to download the language model!"
-    exit 1
-else
-    echo "Download the language model sucessfully"
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/librispeech/local/download_model.sh b/examples/other/1xt2x/librispeech/local/download_model.sh
deleted file mode 100644
index 375d66404..000000000
--- a/examples/other/1xt2x/librispeech/local/download_model.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /usr/bin/env bash
-
-if [ $# != 1 ];then
-    echo "usage: ${0} ckpt_dir"
-    exit -1
-fi
-
-ckpt_dir=$1
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz'
-MD5=a06d9aadb560ea113984dc98d67232c8
-TARGET=${ckpt_dir}/librispeech_v1.8_to_v2.x.tar.gz
-
-
-echo "Download LibriSpeech model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
-    echo "Fail to download LibriSpeech model!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh
deleted file mode 100755
index ea40046b1..000000000
--- a/examples/other/1xt2x/librispeech/local/test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
-    exit -1
-fi
-
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
-config_path=$1
-decode_config_path=$2
-ckpt_prefix=$3
-model_type=$4
-
-# download language model
-bash local/download_lm_en.sh
-if [ $? -ne 0 ]; then
-   exit 1
-fi
-
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/other/1xt2x/librispeech/path.sh b/examples/other/1xt2x/librispeech/path.sh
deleted file mode 100644
index e3696ddd5..000000000
--- a/examples/other/1xt2x/librispeech/path.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-export MAIN_ROOT=`realpath ${PWD}/../../../../`
-export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
-
-export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh
deleted file mode 100755
index 8b614bbbf..000000000
--- a/examples/other/1xt2x/librispeech/run.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-set -e
-source path.sh
-
-stage=0
-stop_stage=100
-conf_path=conf/deepspeech2.yaml
-decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline
-gpus=1
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-
-v18_ckpt=librispeech_v1.8
-ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # prepare data
-    mkdir -p exp/${ckpt}/checkpoints
-    bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
-fi
diff --git a/examples/other/1xt2x/src_deepspeech2x/__init__.py b/examples/other/1xt2x/src_deepspeech2x/__init__.py
deleted file mode 100644
index 74be4a254..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/__init__.py
+++ /dev/null
@@ -1,370 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Any
-from typing import List
-from typing import Tuple
-from typing import Union
-
-import paddle
-from paddle import nn
-from paddle.fluid import core
-from paddle.nn import functional as F
-
-from paddlespeech.s2t.utils.log import Log
-
-#TODO(Hui Zhang): remove  fluid import
-logger = Log(__name__).getlog()
-
-########### hack logging #############
-logger.warn = logger.warning
-
-########### hack paddle #############
-paddle.half = 'float16'
-paddle.float = 'float32'
-paddle.double = 'float64'
-paddle.short = 'int16'
-paddle.int = 'int32'
-paddle.long = 'int64'
-paddle.uint16 = 'uint16'
-paddle.cdouble = 'complex128'
-
-
-def convert_dtype_to_string(tensor_dtype):
-    """
-    Convert the data type in numpy to the data type in Paddle
-    Args:
-        tensor_dtype(core.VarDesc.VarType): the data type in numpy.
-    Returns:
-        core.VarDesc.VarType: the data type in Paddle.
-    """
-    dtype = tensor_dtype
-    if dtype == core.VarDesc.VarType.FP32:
-        return paddle.float32
-    elif dtype == core.VarDesc.VarType.FP64:
-        return paddle.float64
-    elif dtype == core.VarDesc.VarType.FP16:
-        return paddle.float16
-    elif dtype == core.VarDesc.VarType.INT32:
-        return paddle.int32
-    elif dtype == core.VarDesc.VarType.INT16:
-        return paddle.int16
-    elif dtype == core.VarDesc.VarType.INT64:
-        return paddle.int64
-    elif dtype == core.VarDesc.VarType.BOOL:
-        return paddle.bool
-    elif dtype == core.VarDesc.VarType.BF16:
-        # since there is still no support for bfloat16 in NumPy,
-        # uint16 is used for casting bfloat16
-        return paddle.uint16
-    elif dtype == core.VarDesc.VarType.UINT8:
-        return paddle.uint8
-    elif dtype == core.VarDesc.VarType.INT8:
-        return paddle.int8
-    elif dtype == core.VarDesc.VarType.COMPLEX64:
-        return paddle.complex64
-    elif dtype == core.VarDesc.VarType.COMPLEX128:
-        return paddle.complex128
-    else:
-        raise ValueError("Not supported tensor dtype %s" % dtype)
-
-
-if not hasattr(paddle, 'softmax'):
-    logger.warn("register user softmax to paddle, remove this when fixed!")
-    setattr(paddle, 'softmax', paddle.nn.functional.softmax)
-
-if not hasattr(paddle, 'log_softmax'):
-    logger.warn("register user log_softmax to paddle, remove this when fixed!")
-    setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
-
-if not hasattr(paddle, 'sigmoid'):
-    logger.warn("register user sigmoid to paddle, remove this when fixed!")
-    setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
-
-if not hasattr(paddle, 'log_sigmoid'):
-    logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
-    setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
-
-if not hasattr(paddle, 'relu'):
-    logger.warn("register user relu to paddle, remove this when fixed!")
-    setattr(paddle, 'relu', paddle.nn.functional.relu)
-
-
-def cat(xs, dim=0):
-    return paddle.concat(xs, axis=dim)
-
-
-if not hasattr(paddle, 'cat'):
-    logger.warn(
-        "override cat of paddle if exists or register, remove this when fixed!")
-    paddle.cat = cat
-
-
-########### hack paddle.Tensor #############
-def item(x: paddle.Tensor):
-    return x.numpy().item()
-
-
-if not hasattr(paddle.Tensor, 'item'):
-    logger.warn(
-        "override item of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.item = item
-
-
-def func_long(x: paddle.Tensor):
-    return paddle.cast(x, paddle.long)
-
-
-if not hasattr(paddle.Tensor, 'long'):
-    logger.warn(
-        "override long of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.long = func_long
-
-if not hasattr(paddle.Tensor, 'numel'):
-    logger.warn(
-        "override numel of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.numel = paddle.numel
-
-
-def new_full(x: paddle.Tensor,
-             size: Union[List[int], Tuple[int], paddle.Tensor],
-             fill_value: Union[float, int, bool, paddle.Tensor],
-             dtype=None):
-    return paddle.full(size, fill_value, dtype=x.dtype)
-
-
-if not hasattr(paddle.Tensor, 'new_full'):
-    logger.warn(
-        "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.new_full = new_full
-
-
-def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
-    if convert_dtype_to_string(xs.dtype) == paddle.bool:
-        xs = xs.astype(paddle.int)
-    return xs.equal(
-        paddle.to_tensor(
-            ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place))
-
-
-if not hasattr(paddle.Tensor, 'eq'):
-    logger.warn(
-        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.eq = eq
-
-if not hasattr(paddle, 'eq'):
-    logger.warn(
-        "override eq of paddle if exists or register, remove this when fixed!")
-    paddle.eq = eq
-
-
-def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
-    return xs
-
-
-if not hasattr(paddle.Tensor, 'contiguous'):
-    logger.warn(
-        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.contiguous = contiguous
-
-
-def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
-    nargs = len(args)
-    assert (nargs <= 1)
-    s = paddle.shape(xs)
-    if nargs == 1:
-        return s[args[0]]
-    else:
-        return s
-
-
-#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-logger.warn(
-    "override size of paddle.Tensor "
-    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
-)
-paddle.Tensor.size = size
-
-
-def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
-    return xs.reshape(args)
-
-
-if not hasattr(paddle.Tensor, 'view'):
-    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.view = view
-
-
-def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
-    return xs.reshape(ys.size())
-
-
-if not hasattr(paddle.Tensor, 'view_as'):
-    logger.warn(
-        "register user view_as to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.view_as = view_as
-
-
-def is_broadcastable(shp1, shp2):
-    for a, b in zip(shp1[::-1], shp2[::-1]):
-        if a == 1 or b == 1 or a == b:
-            pass
-        else:
-            return False
-    return True
-
-
-def masked_fill(xs: paddle.Tensor,
-                mask: paddle.Tensor,
-                value: Union[float, int]):
-    assert is_broadcastable(xs.shape, mask.shape) is True
-    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
-    mask = mask.broadcast_to(bshape)
-    trues = paddle.ones_like(xs) * value
-    xs = paddle.where(mask, trues, xs)
-    return xs
-
-
-if not hasattr(paddle.Tensor, 'masked_fill'):
-    logger.warn(
-        "register user masked_fill to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.masked_fill = masked_fill
-
-
-def masked_fill_(xs: paddle.Tensor,
-                 mask: paddle.Tensor,
-                 value: Union[float, int]) -> paddle.Tensor:
-    assert is_broadcastable(xs.shape, mask.shape) is True
-    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
-    mask = mask.broadcast_to(bshape)
-    trues = paddle.ones_like(xs) * value
-    ret = paddle.where(mask, trues, xs)
-    paddle.assign(ret.detach(), output=xs)
-    return xs
-
-
-if not hasattr(paddle.Tensor, 'masked_fill_'):
-    logger.warn(
-        "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.masked_fill_ = masked_fill_
-
-
-def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
-    val = paddle.full_like(xs, value)
-    paddle.assign(val.detach(), output=xs)
-    return xs
-
-
-if not hasattr(paddle.Tensor, 'fill_'):
-    logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.fill_ = fill_
-
-
-def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
-    return paddle.tile(xs, size)
-
-
-if not hasattr(paddle.Tensor, 'repeat'):
-    logger.warn(
-        "register user repeat to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.repeat = repeat
-
-if not hasattr(paddle.Tensor, 'softmax'):
-    logger.warn(
-        "register user softmax to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
-
-if not hasattr(paddle.Tensor, 'sigmoid'):
-    logger.warn(
-        "register user sigmoid to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
-
-if not hasattr(paddle.Tensor, 'relu'):
-    logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
-
-
-def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
-    return x.astype(other.dtype)
-
-
-if not hasattr(paddle.Tensor, 'type_as'):
-    logger.warn(
-        "register user type_as to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'type_as', type_as)
-
-
-def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
-    assert len(args) == 1
-    if isinstance(args[0], str):  # dtype
-        return x.astype(args[0])
-    elif isinstance(args[0], paddle.Tensor):  #Tensor
-        return x.astype(args[0].dtype)
-    else:  # Device
-        return x
-
-
-if not hasattr(paddle.Tensor, 'to'):
-    logger.warn("register user to to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'to', to)
-
-
-def func_float(x: paddle.Tensor) -> paddle.Tensor:
-    return x.astype(paddle.float)
-
-
-if not hasattr(paddle.Tensor, 'float'):
-    logger.warn("register user float to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'float', func_float)
-
-
-def func_int(x: paddle.Tensor) -> paddle.Tensor:
-    return x.astype(paddle.int)
-
-
-if not hasattr(paddle.Tensor, 'int'):
-    logger.warn("register user int to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'int', func_int)
-
-
-def tolist(x: paddle.Tensor) -> List[Any]:
-    return x.numpy().tolist()
-
-
-if not hasattr(paddle.Tensor, 'tolist'):
-    logger.warn(
-        "register user tolist to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'tolist', tolist)
-
-
-########### hack paddle.nn #############
-class GLU(nn.Layer):
-    """Gated Linear Units (GLU) Layer"""
-
-    def __init__(self, dim: int=-1):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, xs):
-        return F.glu(xs, axis=self.dim)
-
-
-if not hasattr(paddle.nn, 'GLU'):
-    logger.warn("register user GLU to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'GLU', GLU)
diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py
deleted file mode 100644
index 88a13fdca..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Evaluation for DeepSpeech2 model."""
-from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester
-from yacs.config import CfgNode
-
-from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
-
-
-def main_sp(config, args):
-    exp = Tester(config, args)
-    exp.setup()
-    exp.run_test()
-
-
-def main(config, args):
-    main_sp(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
-    # save asr result to
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
-    args = parser.parse_args()
-    print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))
-
-    # https://yaml.org/type/float.html
-    config = CfgNode(new_allowed=True)
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.decode_cfg:
-        decode_confs = CfgNode(new_allowed=True)
-        decode_confs.merge_from_file(args.decode_cfg)
-        config.decode = decode_confs
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/__init__.py b/examples/other/1xt2x/src_deepspeech2x/models/__init__.py
deleted file mode 100644
index 185a92b8d..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py
deleted file mode 100644
index 39bea5bf9..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .deepspeech2 import DeepSpeech2InferModel
-from .deepspeech2 import DeepSpeech2Model
-
-__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
deleted file mode 100644
index f6e185ff1..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Deepspeech2 ASR Model"""
-import paddle
-from paddle import nn
-from src_deepspeech2x.models.ds2.rnn import RNNStack
-
-from paddlespeech.s2t.models.ds2.conv import ConvStack
-from paddlespeech.s2t.modules.ctc import CTCDecoder
-from paddlespeech.s2t.utils import layer_tools
-from paddlespeech.s2t.utils.checkpoint import Checkpoint
-from paddlespeech.s2t.utils.log import Log
-logger = Log(__name__).getlog()
-
-__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
-
-
-class CRNNEncoder(nn.Layer):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True):
-        super().__init__()
-        self.rnn_size = rnn_size
-        self.feat_size = feat_size  # 161 for linear
-        self.dict_size = dict_size
-
-        self.conv = ConvStack(feat_size, num_conv_layers)
-
-        i_size = self.conv.output_height  # H after conv stack
-        self.rnn = RNNStack(
-            i_size=i_size,
-            h_size=rnn_size,
-            num_stacks=num_rnn_layers,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-
-    @property
-    def output_size(self):
-        return self.rnn_size * 2
-
-    def forward(self, audio, audio_len):
-        """Compute Encoder outputs
-
-        Args:
-            audio (Tensor): [B, Tmax, D]
-            text (Tensor): [B, Umax]
-            audio_len (Tensor): [B]
-            text_len (Tensor): [B]
-        Returns:
-            x (Tensor): encoder outputs, [B, T, D]
-            x_lens (Tensor): encoder length, [B]
-        """
-        # [B, T, D]  -> [B, D, T]
-        audio = audio.transpose([0, 2, 1])
-        # [B, D, T] -> [B, C=1, D, T]
-        x = audio.unsqueeze(1)
-        x_lens = audio_len
-
-        # convolution group
-        x, x_lens = self.conv(x, x_lens)
-        x_val = x.numpy()
-
-        # convert data from convolution feature map to sequence of vectors
-        #B, C, D, T = paddle.shape(x)  # not work under jit
-        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
-        #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
-        x = x.reshape([0, 0, -1])  #[B, T, C*D]
-
-        # remove padding part
-        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
-        return x, x_lens
-
-
-class DeepSpeech2Model(nn.Layer):
-    """The DeepSpeech2 network structure.
-
-    :param audio_data: Audio spectrogram data layer.
-    :type audio_data: Variable
-    :param text_data: Transcription text data layer.
-    :type text_data: Variable
-    :param audio_len: Valid sequence length data layer.
-    :type audio_len: Variable
-    :param masks: Masks data layer to reset padding.
-    :type masks: Variable
-    :param dict_size: Dictionary size for tokenized transcription.
-    :type dict_size: int
-    :param num_conv_layers: Number of stacking convolution layers.
-    :type num_conv_layers: int
-    :param num_rnn_layers: Number of stacking RNN layers.
-    :type num_rnn_layers: int
-    :param rnn_size: RNN layer size (dimension of RNN cells).
-    :type rnn_size: int
-    :param use_gru: Use gru if set True. Use simple rnn if set False.
-    :type use_gru: bool
-    :param share_rnn_weights: Whether to share input-hidden weights between
-                              forward and backward direction RNNs.
-                              It is only available when use_gru=False.
-    :type share_weights: bool
-    :return: A tuple of an output unnormalized log probability layer (
-             before softmax) and a ctc cost layer.
-    :rtype: tuple of LayerOutput
-    """
-
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True,
-                 blank_id=0):
-        super().__init__()
-        self.encoder = CRNNEncoder(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-        assert (self.encoder.output_size == rnn_size * 2)
-
-        self.decoder = CTCDecoder(
-            odim=dict_size,  # <blank> is in  vocab
-            enc_n_units=self.encoder.output_size,
-            blank_id=blank_id,  # first token is <blank>
-            dropout_rate=0.0,
-            reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
-
-    def forward(self, audio, audio_len, text, text_len):
-        """Compute Model loss
-
-        Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
-            text (Tensor): [B, U]
-            text_len (Tensor): [B]
-
-        Returns:
-            loss (Tensor): [1]
-        """
-        eouts, eouts_len = self.encoder(audio, audio_len)
-        loss = self.decoder(eouts, eouts_len, text, text_len)
-        return loss
-
-    @paddle.no_grad()
-    def decode(self, audio, audio_len):
-        # decoders only accept string encoded in utf-8
-
-        # Make sure the decoder has been initialized
-        eouts, eouts_len = self.encoder(audio, audio_len)
-        probs = self.decoder.softmax(eouts)
-        batch_size = probs.shape[0]
-        self.decoder.reset_decoder(batch_size=batch_size)
-        self.decoder.next(probs, eouts_len)
-        trans_best, trans_beam = self.decoder.decode()
-        return trans_best
-
-    @classmethod
-    def from_pretrained(cls, dataloader, config, checkpoint_path):
-        """Build a DeepSpeech2Model model from a pretrained model.
-        Parameters
-        ----------
-        dataloader: paddle.io.DataLoader
-
-        config: yacs.config.CfgNode
-            model configs
-
-        checkpoint_path: Path or str
-            the path of pretrained model checkpoint, without extension name
-
-        Returns
-        -------
-        DeepSpeech2Model
-            The model built from pretrained result.
-        """
-        model = cls(feat_size=dataloader.collate_fn.feature_size,
-                    dict_size=len(dataloader.collate_fn.vocab_list),
-                    num_conv_layers=config.num_conv_layers,
-                    num_rnn_layers=config.num_rnn_layers,
-                    rnn_size=config.rnn_layer_size,
-                    use_gru=config.use_gru,
-                    share_rnn_weights=config.share_rnn_weights)
-        infos = Checkpoint().load_parameters(
-            model, checkpoint_path=checkpoint_path)
-        logger.info(f"checkpoint info: {infos}")
-        layer_tools.summary(model)
-        return model
-
-    @classmethod
-    def from_config(cls, config):
-        """Build a DeepSpeec2Model from config
-        Parameters
-
-        config: yacs.config.CfgNode
-            config
-        Returns
-        -------
-        DeepSpeech2Model
-            The model built from config.
-        """
-        model = cls(feat_size=config.feat_size,
-                    dict_size=config.dict_size,
-                    num_conv_layers=config.num_conv_layers,
-                    num_rnn_layers=config.num_rnn_layers,
-                    rnn_size=config.rnn_layer_size,
-                    use_gru=config.use_gru,
-                    share_rnn_weights=config.share_rnn_weights,
-                    blank_id=config.blank_id)
-        return model
-
-
-class DeepSpeech2InferModel(DeepSpeech2Model):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True,
-                 blank_id=0):
-        super().__init__(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights,
-            blank_id=blank_id)
-
-    def forward(self, audio, audio_len):
-        """export model function
-
-        Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
-
-        Returns:
-            probs: probs after softmax
-        """
-        eouts, eouts_len = self.encoder(audio, audio_len)
-        probs = self.decoder.softmax(eouts)
-        return probs, eouts_len
-
-    def export(self):
-        static_model = paddle.jit.to_static(
-            self,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, None, self.encoder.feat_size],
-                    dtype='float32'),  # audio, [B,T,D]
-                paddle.static.InputSpec(shape=[None],
-                                        dtype='int64'),  # audio_length, [B]
-            ])
-        return static_model
diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
deleted file mode 100644
index 383a07467..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
+++ /dev/null
@@ -1,334 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-import paddle
-from paddle import nn
-from paddle.nn import functional as F
-from paddle.nn import initializer as I
-
-from paddlespeech.s2t.modules.activation import brelu
-from paddlespeech.s2t.modules.mask import make_non_pad_mask
-from paddlespeech.s2t.utils.log import Log
-logger = Log(__name__).getlog()
-
-__all__ = ['RNNStack']
-
-
-class RNNCell(nn.RNNCellBase):
-    r"""
-    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
-    computes the outputs and updates states.
-    The formula used is as follows:
-    .. math::
-        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
-        y_{t} & = h_{t}
-
-    where :math:`act` is for :attr:`activation`.
-    """
-
-    def __init__(self,
-                 hidden_size: int,
-                 activation="tanh",
-                 weight_ih_attr=None,
-                 weight_hh_attr=None,
-                 bias_ih_attr=None,
-                 bias_hh_attr=None,
-                 name=None):
-        super().__init__()
-        std = 1.0 / math.sqrt(hidden_size)
-        self.weight_hh = self.create_parameter(
-            (hidden_size, hidden_size),
-            weight_hh_attr,
-            default_initializer=I.Uniform(-std, std))
-        self.bias_ih = None
-        self.bias_hh = self.create_parameter(
-            (hidden_size, ),
-            bias_hh_attr,
-            is_bias=True,
-            default_initializer=I.Uniform(-std, std))
-
-        self.hidden_size = hidden_size
-        if activation not in ["tanh", "relu", "brelu"]:
-            raise ValueError(
-                "activation for SimpleRNNCell should be tanh or relu, "
-                "but get {}".format(activation))
-        self.activation = activation
-        self._activation_fn = paddle.tanh \
-            if activation == "tanh" \
-            else F.relu
-        if activation == 'brelu':
-            self._activation_fn = brelu
-
-    def forward(self, inputs, states=None):
-        if states is None:
-            states = self.get_initial_states(inputs, self.state_shape)
-        pre_h = states
-        i2h = inputs
-        if self.bias_ih is not None:
-            i2h += self.bias_ih
-        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
-        if self.bias_hh is not None:
-            h2h += self.bias_hh
-        h = self._activation_fn(i2h + h2h)
-        return h, h
-
-    @property
-    def state_shape(self):
-        return (self.hidden_size, )
-
-
-class GRUCell(nn.RNNCellBase):
-    r"""
-    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
-    it computes the outputs and updates states.
-    The formula for GRU used is as follows:
-    ..  math::
-        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
-        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
-        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
-        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
-        y_{t} & = h_{t}
-
-    where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise
-    multiplication operator.
-    """
-
-    def __init__(self,
-                 input_size: int,
-                 hidden_size: int,
-                 weight_ih_attr=None,
-                 weight_hh_attr=None,
-                 bias_ih_attr=None,
-                 bias_hh_attr=None,
-                 name=None):
-        super().__init__()
-        std = 1.0 / math.sqrt(hidden_size)
-        self.weight_hh = self.create_parameter(
-            (3 * hidden_size, hidden_size),
-            weight_hh_attr,
-            default_initializer=I.Uniform(-std, std))
-        self.bias_ih = None
-        self.bias_hh = self.create_parameter(
-            (3 * hidden_size, ),
-            bias_hh_attr,
-            is_bias=True,
-            default_initializer=I.Uniform(-std, std))
-
-        self.hidden_size = hidden_size
-        self.input_size = input_size
-        self._gate_activation = F.sigmoid
-        self._activation = paddle.relu
-
-    def forward(self, inputs, states=None):
-        if states is None:
-            states = self.get_initial_states(inputs, self.state_shape)
-
-        pre_hidden = states  # shape [batch_size, hidden_size]
-
-        x_gates = inputs
-        if self.bias_ih is not None:
-            x_gates = x_gates + self.bias_ih
-        bias_u, bias_r, bias_c = paddle.split(
-            self.bias_hh, num_or_sections=3, axis=0)
-
-        weight_hh = paddle.transpose(
-            self.weight_hh,
-            perm=[1, 0])  #weight_hh:shape[hidden_size, 3 * hidden_size]
-        w_u_r_c = paddle.flatten(weight_hh)
-        size_u_r = self.hidden_size * 2 * self.hidden_size
-        w_u_r = paddle.reshape(w_u_r_c[:size_u_r],
-                               (self.hidden_size, self.hidden_size * 2))
-        w_u, w_r = paddle.split(w_u_r, num_or_sections=2, axis=1)
-        w_c = paddle.reshape(w_u_r_c[size_u_r:],
-                             (self.hidden_size, self.hidden_size))
-
-        h_u = paddle.matmul(
-            pre_hidden, w_u,
-            transpose_y=False) + bias_u  #shape [batch_size, hidden_size]
-        h_r = paddle.matmul(
-            pre_hidden, w_r,
-            transpose_y=False) + bias_r  #shape [batch_size, hidden_size]
-
-        x_u, x_r, x_c = paddle.split(
-            x_gates, num_or_sections=3, axis=1)  #shape[batch_size, hidden_size]
-
-        u = self._gate_activation(x_u + h_u)  #shape [batch_size, hidden_size]
-        r = self._gate_activation(x_r + h_r)  #shape [batch_size, hidden_size]
-        c = self._activation(
-            x_c + paddle.matmul(r * pre_hidden, w_c, transpose_y=False) +
-            bias_c)  # [batch_size, hidden_size]
-
-        h = (1 - u) * pre_hidden + u * c
-        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
-        return h, h
-
-    @property
-    def state_shape(self):
-        r"""
-        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
-        size would be automatically inserted into shape). The shape corresponds
-        to the shape of :math:`h_{t-1}`.
-        """
-        return (self.hidden_size, )
-
-
-class BiRNNWithBN(nn.Layer):
-    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
-    The batch normalization is only performed on input-state weights.
-
-    :param size: Dimension of RNN cells.
-    :type size: int
-    :param share_weights: Whether to share input-hidden weights between
-                          forward and backward directional RNNs.
-    :type share_weights: bool
-    :return: Bidirectional simple rnn layer.
-    :rtype: Variable
-    """
-
-    def __init__(self, i_size: int, h_size: int, share_weights: bool):
-        super().__init__()
-        self.share_weights = share_weights
-        if self.share_weights:
-            #input-hidden weights shared between bi-directional rnn.
-            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
-            # batch norm is only performed on input-state projection
-            self.fw_bn = nn.BatchNorm1D(
-                h_size, bias_attr=None, data_format='NLC')
-            self.bw_fc = self.fw_fc
-            self.bw_bn = self.fw_bn
-        else:
-            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
-            self.fw_bn = nn.BatchNorm1D(
-                h_size, bias_attr=None, data_format='NLC')
-            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
-            self.bw_bn = nn.BatchNorm1D(
-                h_size, bias_attr=None, data_format='NLC')
-
-        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
-        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
-        self.fw_rnn = nn.RNN(
-            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
-        self.bw_rnn = nn.RNN(
-            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]
-
-    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
-        # x, shape [B, T, D]
-        fw_x = self.fw_bn(self.fw_fc(x))
-        bw_x = self.bw_bn(self.bw_fc(x))
-        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
-        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
-        x = paddle.concat([fw_x, bw_x], axis=-1)
-        return x, x_len
-
-
-class BiGRUWithBN(nn.Layer):
-    """Bidirectonal gru layer with sequence-wise batch normalization.
-    The batch normalization is only performed on input-state weights.
-
-    :param name: Name of the layer.
-    :type name: string
-    :param input: Input layer.
-    :type input: Variable
-    :param size: Dimension of GRU cells.
-    :type size: int
-    :param act: Activation type.
-    :type act: string
-    :return: Bidirectional GRU layer.
-    :rtype: Variable
-    """
-
-    def __init__(self, i_size: int, h_size: int):
-        super().__init__()
-        hidden_size = h_size * 3
-
-        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
-        self.fw_bn = nn.BatchNorm1D(
-            hidden_size, bias_attr=None, data_format='NLC')
-        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
-        self.bw_bn = nn.BatchNorm1D(
-            hidden_size, bias_attr=None, data_format='NLC')
-
-        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
-        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
-        self.fw_rnn = nn.RNN(
-            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
-        self.bw_rnn = nn.RNN(
-            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]
-
-    def forward(self, x, x_len):
-        # x, shape [B, T, D]
-        fw_x = self.fw_bn(self.fw_fc(x))
-
-        bw_x = self.bw_bn(self.bw_fc(x))
-        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
-        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
-        x = paddle.concat([fw_x, bw_x], axis=-1)
-        return x, x_len
-
-
-class RNNStack(nn.Layer):
-    """RNN group with stacked bidirectional simple RNN or GRU layers.
-
-    :param input: Input layer.
-    :type input: Variable
-    :param size: Dimension of RNN cells in each layer.
-    :type size: int
-    :param num_stacks: Number of stacked rnn layers.
-    :type num_stacks: int
-    :param use_gru: Use gru if set True. Use simple rnn if set False.
-    :type use_gru: bool
-    :param share_rnn_weights: Whether to share input-hidden weights between
-                              forward and backward directional RNNs.
-                              It is only available when use_gru=False.
-    :type share_weights: bool
-    :return: Output layer of the RNN group.
-    :rtype: Variable
-    """
-
-    def __init__(self,
-                 i_size: int,
-                 h_size: int,
-                 num_stacks: int,
-                 use_gru: bool,
-                 share_rnn_weights: bool):
-        super().__init__()
-        rnn_stacks = []
-        for i in range(num_stacks):
-            if use_gru:
-                #default:GRU using tanh
-                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
-            else:
-                rnn_stacks.append(
-                    BiRNNWithBN(
-                        i_size=i_size,
-                        h_size=h_size,
-                        share_weights=share_rnn_weights))
-            i_size = h_size * 2
-
-        self.rnn_stacks = nn.LayerList(rnn_stacks)
-
-    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
-        """
-        x: shape [B, T, D]
-        x_len: shpae [B]
-        """
-        for i, rnn in enumerate(self.rnn_stacks):
-            x, x_len = rnn(x, x_len)
-            masks = make_non_pad_mask(x_len)  #[B, T]
-            masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
-        return x, x_len
diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py
deleted file mode 100644
index 11b85442d..000000000
--- a/examples/other/1xt2x/src_deepspeech2x/test_model.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Contains DeepSpeech2 and DeepSpeech2Online model."""
-import time
-from collections import defaultdict
-from contextlib import nullcontext
-
-import numpy as np
-import paddle
-from paddle import distributed as dist
-from paddle.io import DataLoader
-from src_deepspeech2x.models.ds2 import DeepSpeech2InferModel
-from src_deepspeech2x.models.ds2 import DeepSpeech2Model
-
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.collator import SpeechCollator
-from paddlespeech.s2t.io.dataset import ManifestDataset
-from paddlespeech.s2t.io.sampler import SortagradBatchSampler
-from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
-from paddlespeech.s2t.training.trainer import Trainer
-from paddlespeech.s2t.utils import error_rate
-from paddlespeech.s2t.utils import layer_tools
-from paddlespeech.s2t.utils import mp_tools
-from paddlespeech.s2t.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-
-class DeepSpeech2Trainer(Trainer):
-    def __init__(self, config, args):
-        super().__init__(config, args)
-
-    def train_batch(self, batch_index, batch_data, msg):
-        train_conf = self.config
-        start = time.time()
-
-        # forward
-        utt, audio, audio_len, text, text_len = batch_data
-        loss = self.model(audio, audio_len, text, text_len)
-        losses_np = {
-            'train_loss': float(loss),
-        }
-
-        # loss backward
-        if (batch_index + 1) % train_conf.accum_grad != 0:
-            # Disable gradient synchronizations across DDP processes.
-            # Within this context, gradients will be accumulated on module
-            # variables, which will later be synchronized.
-            context = self.model.no_sync
-        else:
-            # Used for single gpu training and DDP gradient synchronization
-            # processes.
-            context = nullcontext
-
-        with context():
-            loss.backward()
-            layer_tools.print_grads(self.model, print_func=None)
-
-        # optimizer step
-        if (batch_index + 1) % train_conf.accum_grad == 0:
-            self.optimizer.step()
-            self.optimizer.clear_grad()
-            self.iteration += 1
-
-        iteration_time = time.time() - start
-
-        msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.batch_size)
-        msg += "accum: {}, ".format(train_conf.accum_grad)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in losses_np.items())
-        logger.info(msg)
-
-        if dist.get_rank() == 0 and self.visualizer:
-            for k, v in losses_np.items():
-                # `step -1` since we update `step` after optimizer.step().
-                self.visualizer.add_scalar("train/{}".format(k), v,
-                                           self.iteration - 1)
-
-    @paddle.no_grad()
-    def valid(self):
-        logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
-        self.model.eval()
-        valid_losses = defaultdict(list)
-        num_seen_utts = 1
-        total_loss = 0.0
-        for i, batch in enumerate(self.valid_loader):
-            utt, audio, audio_len, text, text_len = batch
-            loss = self.model(audio, audio_len, text, text_len)
-            if paddle.isfinite(loss):
-                num_utts = batch[1].shape[0]
-                num_seen_utts += num_utts
-                total_loss += float(loss) * num_utts
-                valid_losses['val_loss'].append(float(loss))
-
-            if (i + 1) % self.config.log_interval == 0:
-                valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
-                valid_dump['val_history_loss'] = total_loss / num_seen_utts
-
-                # logging
-                msg = f"Valid: Rank: {dist.get_rank()}, "
-                msg += "epoch: {}, ".format(self.epoch)
-                msg += "step: {}, ".format(self.iteration)
-                msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader))
-                msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                                 for k, v in valid_dump.items())
-                logger.info(msg)
-
-        logger.info('Rank {} Val info val_loss {}'.format(
-            dist.get_rank(), total_loss / num_seen_utts))
-        return total_loss, num_seen_utts
-
-    def setup_model(self):
-        config = self.config.clone()
-        config.defrost()
-        config.feat_size = self.train_loader.collate_fn.feature_size
-        #config.dict_size = self.train_loader.collate_fn.vocab_size
-        config.dict_size = len(self.train_loader.collate_fn.vocab_list)
-        config.freeze()
-
-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
-        if self.parallel:
-            model = paddle.DataParallel(model)
-
-        logger.info(f"{model}")
-        layer_tools.print_params(model, logger.info)
-
-        grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
-        lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
-            learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=lr_scheduler,
-            parameters=model.parameters(),
-            weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
-            grad_clip=grad_clip)
-
-        self.model = model
-        self.optimizer = optimizer
-        self.lr_scheduler = lr_scheduler
-        logger.info("Setup model/optimizer/lr_scheduler!")
-
-    def setup_dataloader(self):
-        config = self.config.clone()
-        config.defrost()
-        config.keep_transcription_text = False
-
-        config.manifest = config.train_manifest
-        train_dataset = ManifestDataset.from_config(config)
-
-        config.manifest = config.dev_manifest
-        dev_dataset = ManifestDataset.from_config(config)
-
-        config.manifest = config.test_manifest
-        test_dataset = ManifestDataset.from_config(config)
-
-        if self.parallel:
-            batch_sampler = SortagradDistributedBatchSampler(
-                train_dataset,
-                batch_size=config.batch_size,
-                num_replicas=None,
-                rank=None,
-                shuffle=True,
-                drop_last=True,
-                sortagrad=config.sortagrad,
-                shuffle_method=config.shuffle_method)
-        else:
-            batch_sampler = SortagradBatchSampler(
-                train_dataset,
-                shuffle=True,
-                batch_size=config.batch_size,
-                drop_last=True,
-                sortagrad=config.sortagrad,
-                shuffle_method=config.shuffle_method)
-
-        collate_fn_train = SpeechCollator.from_config(config)
-
-        config.augmentation_config = ""
-        collate_fn_dev = SpeechCollator.from_config(config)
-
-        config.keep_transcription_text = True
-        config.augmentation_config = ""
-        collate_fn_test = SpeechCollator.from_config(config)
-
-        self.train_loader = DataLoader(
-            train_dataset,
-            batch_sampler=batch_sampler,
-            collate_fn=collate_fn_train,
-            num_workers=config.num_workers)
-        self.valid_loader = DataLoader(
-            dev_dataset,
-            batch_size=config.batch_size,
-            shuffle=False,
-            drop_last=False,
-            collate_fn=collate_fn_dev)
-        self.test_loader = DataLoader(
-            test_dataset,
-            batch_size=config.decode.decode_batch_size,
-            shuffle=False,
-            drop_last=False,
-            collate_fn=collate_fn_test)
-        if "<eos>" in self.test_loader.collate_fn.vocab_list:
-            self.test_loader.collate_fn.vocab_list.remove("<eos>")
-        if "<eos>" in self.valid_loader.collate_fn.vocab_list:
-            self.valid_loader.collate_fn.vocab_list.remove("<eos>")
-        if "<eos>" in self.train_loader.collate_fn.vocab_list:
-            self.train_loader.collate_fn.vocab_list.remove("<eos>")
-        logger.info("Setup train/valid/test  Dataloader!")
-
-
-class DeepSpeech2Tester(DeepSpeech2Trainer):
-    def __init__(self, config, args):
-
-        self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type, vocab=None)
-        super().__init__(config, args)
-
-    def ordid2token(self, texts, texts_len):
-        """ ord() id to chr() chr """
-        trans = []
-        for text, n in zip(texts, texts_len):
-            n = n.numpy().item()
-            ids = text[:n]
-            trans.append(''.join([chr(i) for i in ids]))
-        return trans
-
-    def compute_metrics(self,
-                        utts,
-                        audio,
-                        audio_len,
-                        texts,
-                        texts_len,
-                        fout=None):
-        cfg = self.config.decode
-        errors_sum, len_refs, num_ins = 0.0, 0, 0
-        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
-        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
-
-        target_transcripts = self.ordid2token(texts, texts_len)
-
-        result_transcripts = self.compute_result_transcripts(audio, audio_len)
-
-        for utt, target, result in zip(utts, target_transcripts,
-                                       result_transcripts):
-            errors, len_ref = errors_func(target, result)
-            errors_sum += errors
-            len_refs += len_ref
-            num_ins += 1
-            if fout:
-                fout.write(utt + " " + result + "\n")
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
-            logger.info("Current error rate [%s] = %f" %
-                        (cfg.error_rate_type, error_rate_func(target, result)))
-
-        return dict(
-            errors_sum=errors_sum,
-            len_refs=len_refs,
-            num_ins=num_ins,
-            error_rate=errors_sum / len_refs,
-            error_rate_type=cfg.error_rate_type)
-
-    def compute_result_transcripts(self, audio, audio_len):
-        result_transcripts = self.model.decode(audio, audio_len)
-
-        result_transcripts = [
-            self._text_featurizer.detokenize(item)
-            for item in result_transcripts
-        ]
-        return result_transcripts
-
-    @mp_tools.rank_zero_only
-    @paddle.no_grad()
-    def test(self):
-        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
-        self.model.eval()
-        cfg = self.config
-        error_rate_type = None
-        errors_sum, len_refs, num_ins = 0.0, 0, 0
-
-        # Initialized the decoder in model
-        decode_cfg = self.config.decode
-        vocab_list = self.test_loader.collate_fn.vocab_list
-        decode_batch_size = self.test_loader.batch_size
-        self.model.decoder.init_decoder(
-            decode_batch_size, vocab_list, decode_cfg.decoding_method,
-            decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
-            decode_cfg.beam_size, decode_cfg.cutoff_prob,
-            decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
-
-        with open(self.args.result_file, 'w') as fout:
-            for i, batch in enumerate(self.test_loader):
-                utts, audio, audio_len, texts, texts_len = batch
-                metrics = self.compute_metrics(utts, audio, audio_len, texts,
-                                               texts_len, fout)
-                errors_sum += metrics['errors_sum']
-                len_refs += metrics['len_refs']
-                num_ins += metrics['num_ins']
-                error_rate_type = metrics['error_rate_type']
-                logger.info("Error rate [%s] (%d/?) = %f" %
-                            (error_rate_type, num_ins, errors_sum / len_refs))
-
-        # logging
-        msg = "Test: "
-        msg += "epoch: {}, ".format(self.epoch)
-        msg += "step: {}, ".format(self.iteration)
-        msg += "Final error rate [%s] (%d/%d) = %f" % (
-            error_rate_type, num_ins, num_ins, errors_sum / len_refs)
-        logger.info(msg)
-        self.model.decoder.del_decoder()
-
-    def run_test(self):
-        self.resume_or_scratch()
-        try:
-            self.test()
-        except KeyboardInterrupt:
-            exit(-1)
-
-    def export(self):
-        if self.args.model_type == 'offline':
-            infer_model = DeepSpeech2InferModel.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        elif self.args.model_type == 'online':
-            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
-                self.test_loader, self.config, self.args.checkpoint_path)
-        else:
-            raise Exception("wrong model type")
-
-        infer_model.eval()
-        feat_dim = self.test_loader.collate_fn.feature_size
-        static_model = infer_model.export()
-        logger.info(f"Export code: {static_model.forward.code}")
-        paddle.jit.save(static_model, self.args.export_path)
-
-    def run_export(self):
-        try:
-            self.export()
-        except KeyboardInterrupt:
-            exit(-1)

From bb8c999fea8480b5f4d58ce110e58bbf3bb84b21 Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Fri, 27 May 2022 18:25:09 +0800
Subject: [PATCH 24/40] fix the pathos==0.2.8, test=doc

---
 audio/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/audio/setup.py b/audio/setup.py
index ec67c81de..02b97f070 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -83,7 +83,7 @@ setuptools.setup(
     python_requires='>=3.6',
     install_requires=[
         'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
-        'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'pathos'
+        'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'pathos==0.2.8'
     ],
     extras_require={
         'test': [

From 73f9ca82cc4bdb7279394770377939b0f0c18c92 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Fri, 27 May 2022 20:19:40 +0800
Subject: [PATCH 25/40] Update __init__.py

---
 audio/paddleaudio/metric/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/audio/paddleaudio/metric/__init__.py b/audio/paddleaudio/metric/__init__.py
index d2b3a1360..7ce6f5cff 100644
--- a/audio/paddleaudio/metric/__init__.py
+++ b/audio/paddleaudio/metric/__init__.py
@@ -11,6 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .dtw import dtw_distance
 from .eer import compute_eer
 from .eer import compute_minDCF

From e94c3884a13f3379d795e99239501442ea125c2a Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Fri, 27 May 2022 20:20:01 +0800
Subject: [PATCH 26/40] Delete dtw.py

---
 audio/paddleaudio/metric/dtw.py | 44 ---------------------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 audio/paddleaudio/metric/dtw.py

diff --git a/audio/paddleaudio/metric/dtw.py b/audio/paddleaudio/metric/dtw.py
deleted file mode 100644
index 662e4506d..000000000
--- a/audio/paddleaudio/metric/dtw.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-from dtaidistance import dtw_ndim
-
-__all__ = [
-    'dtw_distance',
-]
-
-
-def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
-    """Dynamic Time Warping.
-    This function keeps a compact matrix, not the full warping paths matrix.
-    Uses dynamic programming to compute:
-
-    Examples:
-        .. code-block:: python
-
-            wps[i, j] = (s1[i]-s2[j])**2 + min(
-                            wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
-                            wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
-                            wps[i-1, j-1])            // diagonal   / match
-
-            dtw = sqrt(wps[-1, -1])
-
-    Args:
-        xs (np.ndarray): ref sequence, [T,D]
-        ys (np.ndarray): hyp sequence, [T,D]
-
-    Returns:
-        float: dtw distance
-    """
-    return dtw_ndim.distance(xs, ys)

From 21402cd3be7e0017c064cec686cfdd939d0ac330 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Fri, 27 May 2022 20:21:33 +0800
Subject: [PATCH 27/40] Update setup.py

---
 audio/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/audio/setup.py b/audio/setup.py
index 02b97f070..80fe07b7a 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -83,7 +83,7 @@ setuptools.setup(
     python_requires='>=3.6',
     install_requires=[
         'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
-        'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'pathos==0.2.8'
+        'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
     ],
     extras_require={
         'test': [

From b6ad4260eb0e233bc91d80fb6acbc72f074710fd Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Sun, 29 May 2022 04:12:43 +0000
Subject: [PATCH 28/40] fix bug in tts cli, test=tts

---
 paddlespeech/cli/tts/infer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 5fa9b3ed0..879d4a4db 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -209,7 +209,7 @@ class TTSExecutor(BaseExecutor):
         self.tones_dict = None
         if 'tones_dict' in self.pretrained_models[am_tag]:
             self.tones_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['tones_dict'])
+                self.am_res_path, self.pretrained_models[am_tag]['tones_dict'])
             if tones_dict:
                 self.tones_dict = tones_dict
 
@@ -217,7 +217,8 @@ class TTSExecutor(BaseExecutor):
         self.speaker_dict = None
         if 'speaker_dict' in self.pretrained_models[am_tag]:
             self.speaker_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
+                self.am_res_path,
+                self.pretrained_models[am_tag]['speaker_dict'])
             if speaker_dict:
                 self.speaker_dict = speaker_dict
 

From fa6e44e4fff6def13316cae5cdd35cbb79a4be08 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Tue, 17 May 2022 17:39:18 +0800
Subject: [PATCH 29/40] Add paddlespeech.resource.

---
 paddlespeech/cli/asr/infer.py                 |  31 +-
 paddlespeech/cli/asr/pretrained_models.py     | 151 ----
 paddlespeech/cli/base_commands.py             |  63 +-
 paddlespeech/cli/cls/infer.py                 |  31 +-
 paddlespeech/cli/cls/pretrained_models.py     |  47 -
 paddlespeech/cli/executor.py                  |  49 +-
 paddlespeech/cli/st/infer.py                  |  36 +-
 paddlespeech/cli/st/pretrained_models.py      |  35 -
 paddlespeech/cli/stats/infer.py               | 146 ----
 paddlespeech/cli/text/infer.py                |  30 +-
 paddlespeech/cli/text/pretrained_models.py    |  54 --
 paddlespeech/cli/tts/infer.py                 |  70 +-
 paddlespeech/cli/tts/pretrained_models.py     | 300 -------
 paddlespeech/cli/vector/infer.py              |  29 +-
 paddlespeech/cli/vector/pretrained_models.py  |  36 -
 .../{cli/stats => resource}/__init__.py       |   4 +-
 paddlespeech/resource/pretrained_models.py    | 822 ++++++++++++++++++
 paddlespeech/resource/resource.py             | 222 +++++
 .../server/bin/paddlespeech_server.py         | 110 +--
 .../server/engine/asr/online/asr_engine.py    |  40 +-
 .../engine/asr/online/pretrained_models.py    |  70 --
 .../engine/asr/paddleinference/asr_engine.py  |  25 +-
 .../asr/paddleinference/pretrained_models.py  |  34 -
 .../engine/cls/paddleinference/cls_engine.py  |  18 +-
 .../cls/paddleinference/pretrained_models.py  |  58 --
 .../tts/online/onnx/pretrained_models.py      |  69 --
 .../engine/tts/online/onnx/tts_engine.py      |  41 +-
 .../tts/online/python/pretrained_models.py    |  73 --
 .../engine/tts/online/python/tts_engine.py    |  57 +-
 .../tts/paddleinference/pretrained_models.py  |  87 --
 .../engine/tts/paddleinference/tts_engine.py  |  51 +-
 31 files changed, 1382 insertions(+), 1507 deletions(-)
 delete mode 100644 paddlespeech/cli/asr/pretrained_models.py
 delete mode 100644 paddlespeech/cli/cls/pretrained_models.py
 delete mode 100644 paddlespeech/cli/st/pretrained_models.py
 delete mode 100644 paddlespeech/cli/stats/infer.py
 delete mode 100644 paddlespeech/cli/text/pretrained_models.py
 delete mode 100644 paddlespeech/cli/tts/pretrained_models.py
 delete mode 100644 paddlespeech/cli/vector/pretrained_models.py
 rename paddlespeech/{cli/stats => resource}/__init__.py (83%)
 create mode 100644 paddlespeech/resource/pretrained_models.py
 create mode 100644 paddlespeech/resource/resource.py
 delete mode 100644 paddlespeech/server/engine/asr/online/pretrained_models.py
 delete mode 100644 paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
 delete mode 100644 paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
 delete mode 100644 paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
 delete mode 100644 paddlespeech/server/engine/tts/online/python/pretrained_models.py
 delete mode 100644 paddlespeech/server/engine/tts/paddleinference/pretrained_models.py

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 09e8202fd..842acf5ce 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -33,11 +33,8 @@ from ..utils import CLI_TIMER
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
 from ..utils import timer_register
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.transform.transformation import Transformation
-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.utility import UpdateConfig
 
 __all__ = ['ASRExecutor']
@@ -46,10 +43,7 @@ __all__ = ['ASRExecutor']
 @timer_register
 class ASRExecutor(BaseExecutor):
     def __init__(self):
-        super().__init__()
-        self.model_alias = model_alias
-        self.pretrained_models = pretrained_models
-
+        super().__init__(task='asr', inference_type='offline')
         self.parser = argparse.ArgumentParser(
             prog='paddlespeech.asr', add_help=True)
         self.parser.add_argument(
@@ -59,7 +53,8 @@ class ASRExecutor(BaseExecutor):
             type=str,
             default='conformer_wenetspeech',
             choices=[
-                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+                tag[:tag.index('-')]
+                for tag in self.task_resource.pretrained_models.keys()
             ],
             help='Choose model type of asr task.')
         self.parser.add_argument(
@@ -141,14 +136,14 @@ class ASRExecutor(BaseExecutor):
         if cfg_path is None or ckpt_path is None:
             sample_rate_str = '16k' if sample_rate == 16000 else '8k'
             tag = model_type + '-' + lang + '-' + sample_rate_str
-            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
-            self.res_path = res_path
+            self.task_resource.set_task_model(tag, version=None)
+            self.res_path = self.task_resource.res_dir
             self.cfg_path = os.path.join(
-                res_path, self.pretrained_models[tag]['cfg_path'])
+                self.res_path, self.task_resource.res_dict['cfg_path'])
             self.ckpt_path = os.path.join(
-                res_path,
-                self.pretrained_models[tag]['ckpt_path'] + ".pdparams")
-            logger.info(res_path)
+                self.res_path,
+                self.task_resource.res_dict['ckpt_path'] + ".pdparams")
+            logger.info(self.res_path)
 
         else:
             self.cfg_path = os.path.abspath(cfg_path)
@@ -172,8 +167,8 @@ class ASRExecutor(BaseExecutor):
                 self.collate_fn_test = SpeechCollator.from_config(self.config)
                 self.text_feature = TextFeaturizer(
                     unit_type=self.config.unit_type, vocab=self.vocab)
-                lm_url = self.pretrained_models[tag]['lm_url']
-                lm_md5 = self.pretrained_models[tag]['lm_md5']
+                lm_url = self.task_resource.res_dict['lm_url']
+                lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(
                     lm_url,
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)
@@ -191,7 +186,7 @@ class ASRExecutor(BaseExecutor):
                 raise Exception("wrong type")
         model_name = model_type[:model_type.rindex(
             '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, self.model_alias)
+        model_class = self.task_resource.get_model_class(model_name)
         model_conf = self.config
         model = model_class.from_config(model_conf)
         self.model = model
@@ -438,7 +433,7 @@ class ASRExecutor(BaseExecutor):
         if not parser_args.verbose:
             self.disable_task_loggers()
 
-        task_source = self.get_task_source(parser_args.input)
+        task_source = self.get_input_source(parser_args.input)
         task_results = OrderedDict()
         has_exceptions = False
 
diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py
deleted file mode 100644
index 0f5218840..000000000
--- a/paddlespeech/cli/asr/pretrained_models.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "conformer_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '76cb19ed857e6623856b7cd7ebbfeda4',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/conformer/checkpoints/wenetspeech',
-    },
-    "conformer_online_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz',
-        'md5':
-        'b8c02632b04da34aca88459835be54a6',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/chunk_conformer/checkpoints/avg_10',
-    },
-    "conformer_online_multicn-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz',
-        'md5':
-        '7989b3248c898070904cf042fd656003',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/chunk_conformer/checkpoints/multi_cn',
-    },
-    "conformer_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz',
-        'md5':
-        '3f073eccfa7bb14e0c6867d65fc0dc3a',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/conformer/checkpoints/avg_30',
-    },
-    "conformer_online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz',
-        'md5':
-        'b374cfb93537761270b6224fb0bfc26a',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/chunk_conformer/checkpoints/avg_30',
-    },
-    "transformer_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '2c667da24922aad391eacafe37bc1660',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/transformer/checkpoints/avg_10',
-    },
-    "deepspeech2online_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
-        'md5':
-        'e393d4d274af0f6967db24fc146e8074',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_10',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2offline_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '932c3593d62fe5c741b59b31318aa314',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
-        'md5':
-        '98b87b171b7240b7cae6e07d8d0bc9be',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2offline_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        'f5666c81ad015c8de03aac2bc92e5762',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
-        'lm_md5':
-        '099a601759d467cd0a8523ff939819c5'
-    },
-}
-
-model_alias = {
-    "deepspeech2offline":
-    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
-    "deepspeech2online":
-    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
-    "conformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "conformer_online":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "transformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "wenetspeech":
-    "paddlespeech.s2t.models.u2:U2Model",
-}
diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py
index 4d4d2cc69..39bf24524 100644
--- a/paddlespeech/cli/base_commands.py
+++ b/paddlespeech/cli/base_commands.py
@@ -11,17 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
 from typing import List
 
+from prettytable import PrettyTable
+
+from ..resource import CommonTaskResource
 from .entry import commands
 from .utils import cli_register
 from .utils import explicit_command_register
 from .utils import get_command
 
-__all__ = [
-    'BaseCommand',
-    'HelpCommand',
-]
+__all__ = ['BaseCommand', 'HelpCommand', 'StatsCommand']
 
 
 @cli_register(name='paddlespeech')
@@ -76,6 +77,59 @@ class VersionCommand:
         return True
 
 
+model_name_format = {
+    'asr': 'Model-Language-Sample Rate',
+    'cls': 'Model-Sample Rate',
+    'st': 'Model-Source language-Target language',
+    'text': 'Model-Task-Language',
+    'tts': 'Model-Language',
+    'vector': 'Model-Sample Rate'
+}
+
+
+@cli_register(
+    name='paddlespeech.stats',
+    description='Get speech tasks support models list.')
+class StatsCommand:
+    def __init__(self):
+        self.parser = argparse.ArgumentParser(
+            prog='paddlespeech.stats', add_help=True)
+        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
+        self.parser.add_argument(
+            '--task',
+            type=str,
+            default='asr',
+            choices=self.task_choices,
+            help='Choose speech task.',
+            required=True)
+
+    def show_support_models(self, pretrained_models: dict):
+        """Print a table with one row per model tag, split on '-'."""
+        table = PrettyTable(model_name_format[self.task].split("-"))
+        for key in pretrained_models:
+            table.add_row(key.split("-"))
+        print(table)
+
+    def execute(self, argv: List[str]) -> bool:
+        """Command line entry. Returns True only if the list was printed."""
+        self.task = self.parser.parse_args(argv).task
+        if self.task not in self.task_choices:
+            print("Please input correct speech task, choices = " + str(
+                self.task_choices))
+            return False
+        pretrained_models = CommonTaskResource(task=self.task).pretrained_models
+        try:
+            print(
+                "Here is the list of {} pretrained models released by PaddleSpeech that can be used by command line and python API"
+                .format(self.task.upper()))
+            self.show_support_models(pretrained_models)
+        except Exception:
+            print("Failed to get the list of {} pretrained models.".format(
+                self.task.upper()))
+            return False
+        return True
+
+
 # Dynamic import when running specific command
 _commands = {
     'asr': ['Speech to text infer command.', 'ASRExecutor'],
@@ -91,3 +145,4 @@ for com, info in _commands.items():
         name='paddlespeech.{}'.format(com),
         description=info[0],
         cls='paddlespeech.cli.{}.{}'.format(com, info[1]))
+        
\ No newline at end of file
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
index 3d807b60b..1a9949748 100644
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -21,26 +21,19 @@ from typing import Union
 import numpy as np
 import paddle
 import yaml
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram
-from paddlespeech.utils.dynamic_import import dynamic_import
 
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
-
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
 
 __all__ = ['CLSExecutor']
 
 
 class CLSExecutor(BaseExecutor):
     def __init__(self):
-        super().__init__()
-        self.model_alias = model_alias
-        self.pretrained_models = pretrained_models
-
+        super().__init__(task='cls')
         self.parser = argparse.ArgumentParser(
             prog='paddlespeech.cls', add_help=True)
         self.parser.add_argument(
@@ -50,7 +43,8 @@ class CLSExecutor(BaseExecutor):
             type=str,
             default='panns_cnn14',
             choices=[
-                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+                tag[:tag.index('-')]
+                for tag in self.task_resource.pretrained_models.keys()
             ],
             help='Choose model type of cls task.')
         self.parser.add_argument(
@@ -103,13 +97,16 @@ class CLSExecutor(BaseExecutor):
 
         if label_file is None or ckpt_path is None:
             tag = model_type + '-' + '32k'  # panns_cnn14-32k
-            self.res_path = self._get_pretrained_path(tag)
+            self.task_resource.set_task_model(tag, version=None)
             self.cfg_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['cfg_path'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['cfg_path'])
             self.label_file = os.path.join(
-                self.res_path, self.pretrained_models[tag]['label_file'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['label_file'])
             self.ckpt_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['ckpt_path'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['ckpt_path'])
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.label_file = os.path.abspath(label_file)
@@ -126,7 +123,7 @@ class CLSExecutor(BaseExecutor):
                 self._label_list.append(line.strip())
 
         # model
-        model_class = dynamic_import(model_type, self.model_alias)
+        model_class = self.task_resource.get_model_class(model_type)
         model_dict = paddle.load(self.ckpt_path)
         self.model = model_class(extract_embedding=False)
         self.model.set_state_dict(model_dict)
@@ -203,7 +200,7 @@ class CLSExecutor(BaseExecutor):
         if not parser_args.verbose:
             self.disable_task_loggers()
 
-        task_source = self.get_task_source(parser_args.input)
+        task_source = self.get_input_source(parser_args.input)
         task_results = OrderedDict()
         has_exceptions = False
 
diff --git a/paddlespeech/cli/cls/pretrained_models.py b/paddlespeech/cli/cls/pretrained_models.py
deleted file mode 100644
index 1d66850aa..000000000
--- a/paddlespeech/cli/cls/pretrained_models.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "panns_cnn6-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
-        'md5': '4cf09194a95df024fd12f84712cf0f9c',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn6.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn10-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
-        'md5': 'cb8427b22176cc2116367d14847f5413',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn10.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn14-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
-        'md5': 'e3b9b5614a1595001161d0ab95edee97',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn14.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-}
-
-model_alias = {
-    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
-    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
-    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
-}
diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py
index 4a631c7f5..d390f947d 100644
--- a/paddlespeech/cli/executor.py
+++ b/paddlespeech/cli/executor.py
@@ -24,9 +24,8 @@ from typing import Union
 
 import paddle
 
+from ..resource import CommonTaskResource
 from .log import logger
-from .utils import download_and_decompress
-from .utils import MODEL_HOME
 
 
 class BaseExecutor(ABC):
@@ -34,11 +33,10 @@ class BaseExecutor(ABC):
         An abstract executor of paddlespeech tasks.
     """
 
-    def __init__(self):
+    def __init__(self, task: str, **kwargs):
         self._inputs = OrderedDict()
         self._outputs = OrderedDict()
-        self.pretrained_models = OrderedDict()
-        self.model_alias = OrderedDict()
+        self.task_resource = CommonTaskResource(task=task, **kwargs)
 
     @abstractmethod
     def _init_from_path(self, *args, **kwargs):
@@ -98,8 +96,8 @@ class BaseExecutor(ABC):
         """
         pass
 
-    def get_task_source(self, input_: Union[str, os.PathLike, None]
-                        ) -> Dict[str, Union[str, os.PathLike]]:
+    def get_input_source(self, input_: Union[str, os.PathLike, None]
+                         ) -> Dict[str, Union[str, os.PathLike]]:
         """
         Get task input source from command line input.
 
@@ -115,15 +113,17 @@ class BaseExecutor(ABC):
             ret = OrderedDict()
 
             if input_ is None:  # Take input from stdin
-                for i, line in enumerate(sys.stdin):
-                    line = line.strip()
-                    if len(line.split(' ')) == 1:
-                        ret[str(i + 1)] = line
-                    elif len(line.split(' ')) == 2:
-                        id_, info = line.split(' ')
-                        ret[id_] = info
-                    else:  # No valid input info from one line.
-                        continue
+                if not sys.stdin.isatty(
+                ):  # Avoid getting stuck when stdin is empty.
+                    for i, line in enumerate(sys.stdin):
+                        line = line.strip()
+                        if len(line.split(' ')) == 1:
+                            ret[str(i + 1)] = line
+                        elif len(line.split(' ')) == 2:
+                            id_, info = line.split(' ')
+                            ret[id_] = info
+                        else:  # No valid input info from one line.
+                            continue
             else:
                 ret[1] = input_
         return ret
@@ -219,23 +219,6 @@ class BaseExecutor(ABC):
         for l in loggers:
             l.disabled = True
 
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(self.pretrained_models.keys())
-        assert tag in self.pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(self.pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-
-        return decompressed_path
-
     def show_rtf(self, info: Dict[str, List[float]]):
         """
         Calculate rft of current task and show results.
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index ae188b349..e1ce181af 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -31,21 +31,22 @@ from ..log import logger
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from .pretrained_models import kaldi_bins
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.utils.utility import UpdateConfig
-from paddlespeech.utils.dynamic_import import dynamic_import
 
 __all__ = ["STExecutor"]
 
+kaldi_bins = {
+    "url":
+    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
+    "md5":
+    "c0682303b3f3393dbf6ed4c4e35a53eb",
+}
+
 
 class STExecutor(BaseExecutor):
     def __init__(self):
-        super().__init__()
-        self.model_alias = model_alias
-        self.pretrained_models = pretrained_models
+        super().__init__(task='st')
         self.kaldi_bins = kaldi_bins
 
         self.parser = argparse.ArgumentParser(
@@ -57,7 +58,8 @@ class STExecutor(BaseExecutor):
             type=str,
             default="fat_st_ted",
             choices=[
-                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+                tag[:tag.index('-')]
+                for tag in self.task_resource.pretrained_models.keys()
             ],
             help="Choose model type of st task.")
         self.parser.add_argument(
@@ -131,14 +133,16 @@ class STExecutor(BaseExecutor):
 
         if cfg_path is None or ckpt_path is None:
             tag = model_type + "-" + src_lang + "-" + tgt_lang
-            res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(res_path,
-                                         pretrained_models[tag]["cfg_path"])
-            self.ckpt_path = os.path.join(res_path,
-                                          pretrained_models[tag]["ckpt_path"])
-            logger.info(res_path)
+            self.task_resource.set_task_model(tag, version=None)
+            self.cfg_path = os.path.join(
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['cfg_path'])
+            self.ckpt_path = os.path.join(
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['ckpt_path'])
             logger.info(self.cfg_path)
             logger.info(self.ckpt_path)
+            res_path = self.task_resource.res_dir
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path)
@@ -163,7 +167,7 @@ class STExecutor(BaseExecutor):
         model_conf = self.config
         model_name = model_type[:model_type.rindex(
             '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, self.model_alias)
+        model_class = self.task_resource.get_model_class(model_name)
         self.model = model_class.from_config(model_conf)
         self.model.eval()
 
@@ -301,7 +305,7 @@ class STExecutor(BaseExecutor):
         if not parser_args.verbose:
             self.disable_task_loggers()
 
-        task_source = self.get_task_source(parser_args.input)
+        task_source = self.get_input_source(parser_args.input)
         task_results = OrderedDict()
         has_exceptions = False
 
diff --git a/paddlespeech/cli/st/pretrained_models.py b/paddlespeech/cli/st/pretrained_models.py
deleted file mode 100644
index cc7410d25..000000000
--- a/paddlespeech/cli/st/pretrained_models.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    "fat_st_ted-en-zh": {
-        "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
-        "md5":
-        "d62063f35a16d91210a71081bd2dd557",
-        "cfg_path":
-        "model.yaml",
-        "ckpt_path":
-        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
-    }
-}
-
-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
-
-kaldi_bins = {
-    "url":
-    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
-    "md5":
-    "c0682303b3f3393dbf6ed4c4e35a53eb",
-}
diff --git a/paddlespeech/cli/stats/infer.py b/paddlespeech/cli/stats/infer.py
deleted file mode 100644
index 7cf4f2368..000000000
--- a/paddlespeech/cli/stats/infer.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-from typing import List
-
-from prettytable import PrettyTable
-
-from ..utils import cli_register
-from ..utils import stats_wrapper
-
-__all__ = ['StatsExecutor']
-
-model_name_format = {
-    'asr': 'Model-Language-Sample Rate',
-    'cls': 'Model-Sample Rate',
-    'st': 'Model-Source language-Target language',
-    'text': 'Model-Task-Language',
-    'tts': 'Model-Language',
-    'vector': 'Model-Sample Rate'
-}
-
-
-@cli_register(
-    name='paddlespeech.stats',
-    description='Get speech tasks support models list.')
-class StatsExecutor():
-    def __init__(self):
-        super().__init__()
-
-        self.parser = argparse.ArgumentParser(
-            prog='paddlespeech.stats', add_help=True)
-        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
-        self.parser.add_argument(
-            '--task',
-            type=str,
-            default='asr',
-            choices=self.task_choices,
-            help='Choose speech task.',
-            required=True)
-
-    def show_support_models(self, pretrained_models: dict):
-        fields = model_name_format[self.task].split("-")
-        table = PrettyTable(fields)
-        for key in pretrained_models:
-            table.add_row(key.split("-"))
-        print(table)
-
-    def execute(self, argv: List[str]) -> bool:
-        """
-            Command line entry.
-        """
-        parser_args = self.parser.parse_args(argv)
-        has_exceptions = False
-        try:
-            self(parser_args.task)
-        except Exception as e:
-            has_exceptions = True
-        if has_exceptions:
-            return False
-        else:
-            return True
-
-    @stats_wrapper
-    def __call__(
-            self,
-            task: str=None, ):
-        """
-            Python API to call an executor.
-        """
-        self.task = task
-        if self.task not in self.task_choices:
-            print("Please input correct speech task, choices = " + str(
-                self.task_choices))
-
-        elif self.task == 'asr':
-            try:
-                from ..asr.pretrained_models import pretrained_models
-                print(
-                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-            except BaseException:
-                print("Failed to get the list of ASR pretrained models.")
-
-        elif self.task == 'cls':
-            try:
-                from ..cls.pretrained_models import pretrained_models
-                print(
-                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-            except BaseException:
-                print("Failed to get the list of CLS pretrained models.")
-
-        elif self.task == 'st':
-            try:
-                from ..st.pretrained_models import pretrained_models
-                print(
-                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-            except BaseException:
-                print("Failed to get the list of ST pretrained models.")
-
-        elif self.task == 'text':
-            try:
-                from ..text.pretrained_models import pretrained_models
-                print(
-                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-            except BaseException:
-                print("Failed to get the list of TEXT pretrained models.")
-
-        elif self.task == 'tts':
-            try:
-                from ..tts.pretrained_models import pretrained_models
-                print(
-                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-            except BaseException:
-                print("Failed to get the list of TTS pretrained models.")
-
-        elif self.task == 'vector':
-            try:
-                from ..vector.pretrained_models import pretrained_models
-                print(
-                    "Here is the list of Speaker Recognition pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-            except BaseException:
-                print(
-                    "Failed to get the list of Speaker Recognition pretrained models."
-                )
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index be5b5a10d..7b8faf99c 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -24,21 +24,13 @@ import paddle
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
-from .pretrained_models import tokenizer_alias
-from paddlespeech.utils.dynamic_import import dynamic_import
 
 __all__ = ['TextExecutor']
 
 
 class TextExecutor(BaseExecutor):
     def __init__(self):
-        super().__init__()
-        self.model_alias = model_alias
-        self.pretrained_models = pretrained_models
-        self.tokenizer_alias = tokenizer_alias
-
+        super().__init__(task='text')
         self.parser = argparse.ArgumentParser(
             prog='paddlespeech.text', add_help=True)
         self.parser.add_argument(
@@ -54,7 +46,8 @@ class TextExecutor(BaseExecutor):
             type=str,
             default='ernie_linear_p7_wudao',
             choices=[
-                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+                tag[:tag.index('-')]
+                for tag in self.task_resource.pretrained_models.keys()
             ],
             help='Choose model type of text task.')
         self.parser.add_argument(
@@ -112,13 +105,16 @@ class TextExecutor(BaseExecutor):
 
         if cfg_path is None or ckpt_path is None or vocab_file is None:
             tag = '-'.join([model_type, task, lang])
-            self.res_path = self._get_pretrained_path(tag)
+            self.task_resource.set_task_model(tag, version=None)
             self.cfg_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['cfg_path'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['cfg_path'])
             self.ckpt_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['ckpt_path'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['ckpt_path'])
             self.vocab_file = os.path.join(
-                self.res_path, self.pretrained_models[tag]['vocab_file'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['vocab_file'])
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path)
@@ -133,8 +129,8 @@ class TextExecutor(BaseExecutor):
                     self._punc_list.append(line.strip())
 
             # model
-            model_class = dynamic_import(model_name, self.model_alias)
-            tokenizer_class = dynamic_import(model_name, self.tokenizer_alias)
+            model_class, tokenizer_class = self.task_resource.get_model_class(
+                model_name)
             self.model = model_class(
                 cfg_path=self.cfg_path, ckpt_path=self.ckpt_path)
             self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0')
@@ -224,7 +220,7 @@ class TextExecutor(BaseExecutor):
         if not parser_args.verbose:
             self.disable_task_loggers()
 
-        task_source = self.get_task_source(parser_args.input)
+        task_source = self.get_input_source(parser_args.input)
         task_results = OrderedDict()
         has_exceptions = False
 
diff --git a/paddlespeech/cli/text/pretrained_models.py b/paddlespeech/cli/text/pretrained_models.py
deleted file mode 100644
index 817d3caa3..000000000
--- a/paddlespeech/cli/text/pretrained_models.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "ernie_linear_p7_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
-        'md5':
-        '12283e2ddde1797c5d1e57036b512746',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-    "ernie_linear_p3_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
-        'md5':
-        '448eb2fdf85b6a997e7e652e80c51dd2',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-}
-
-model_alias = {
-    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
-    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
-}
-
-tokenizer_alias = {
-    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
-    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
-}
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 879d4a4db..4e0337bcc 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -29,22 +29,16 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
-from paddlespeech.utils.dynamic_import import dynamic_import
 
 __all__ = ['TTSExecutor']
 
 
 class TTSExecutor(BaseExecutor):
     def __init__(self):
-        super().__init__()
-        self.model_alias = model_alias
-        self.pretrained_models = pretrained_models
-
+        super().__init__('tts')
         self.parser = argparse.ArgumentParser(
             prog='paddlespeech.tts', add_help=True)
         self.parser.add_argument(
@@ -183,19 +177,23 @@ class TTSExecutor(BaseExecutor):
             return
         # am
         am_tag = am + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=am_tag,
+            model_type=0,  # am
+            version=None,  # default version
+        )
         if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
-            am_res_path = self._get_pretrained_path(am_tag)
-            self.am_res_path = am_res_path
-            self.am_config = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['config'])
-            self.am_ckpt = os.path.join(am_res_path,
-                                        self.pretrained_models[am_tag]['ckpt'])
+            self.am_res_path = self.task_resource.res_dir
+            self.am_config = os.path.join(self.am_res_path,
+                                          self.task_resource.res_dict['config'])
+            self.am_ckpt = os.path.join(self.am_res_path,
+                                        self.task_resource.res_dict['ckpt'])
             self.am_stat = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
+                self.am_res_path, self.task_resource.res_dict['speech_stats'])
             # must have phones_dict in acoustic
             self.phones_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
-            logger.info(am_res_path)
+                self.am_res_path, self.task_resource.res_dict['phones_dict'])
+            logger.info(self.am_res_path)
             logger.info(self.am_config)
             logger.info(self.am_ckpt)
         else:
@@ -207,33 +205,37 @@ class TTSExecutor(BaseExecutor):
 
         # for speedyspeech
         self.tones_dict = None
-        if 'tones_dict' in self.pretrained_models[am_tag]:
+        if 'tones_dict' in self.task_resource.res_dict:
             self.tones_dict = os.path.join(
-                self.am_res_path, self.pretrained_models[am_tag]['tones_dict'])
+                self.am_res_path, self.task_resource.res_dict['tones_dict'])
             if tones_dict:
                 self.tones_dict = tones_dict
 
         # for multi speaker fastspeech2
         self.speaker_dict = None
-        if 'speaker_dict' in self.pretrained_models[am_tag]:
+        if 'speaker_dict' in self.task_resource.res_dict:
             self.speaker_dict = os.path.join(
-                self.am_res_path,
-                self.pretrained_models[am_tag]['speaker_dict'])
+                self.am_res_path, self.task_resource.res_dict['speaker_dict'])
             if speaker_dict:
                 self.speaker_dict = speaker_dict
 
         # voc
         voc_tag = voc + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=voc_tag,
+            model_type=1,  # vocoder
+            version=None,  # default version
+        )
         if voc_ckpt is None or voc_config is None or voc_stat is None:
-            voc_res_path = self._get_pretrained_path(voc_tag)
-            self.voc_res_path = voc_res_path
+            self.voc_res_path = self.task_resource.voc_res_dir
             self.voc_config = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['config'])
+                self.voc_res_path, self.task_resource.voc_res_dict['config'])
             self.voc_ckpt = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
+                self.voc_res_path, self.task_resource.voc_res_dict['ckpt'])
             self.voc_stat = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
-            logger.info(voc_res_path)
+                self.voc_res_path,
+                self.task_resource.voc_res_dict['speech_stats'])
+            logger.info(self.voc_res_path)
             logger.info(self.voc_config)
             logger.info(self.voc_ckpt)
         else:
@@ -283,9 +285,9 @@ class TTSExecutor(BaseExecutor):
         # model: {model_name}_{dataset}
         am_name = am[:am.rindex('_')]
 
-        am_class = dynamic_import(am_name, self.model_alias)
-        am_inference_class = dynamic_import(am_name + '_inference',
-                                            self.model_alias)
+        am_class = self.task_resource.get_model_class(am_name)
+        am_inference_class = self.task_resource.get_model_class(am_name +
+                                                                '_inference')
 
         if am_name == 'fastspeech2':
             am = am_class(
@@ -314,9 +316,9 @@ class TTSExecutor(BaseExecutor):
         # vocoder
         # model: {model_name}_{dataset}
         voc_name = voc[:voc.rindex('_')]
-        voc_class = dynamic_import(voc_name, self.model_alias)
-        voc_inference_class = dynamic_import(voc_name + '_inference',
-                                             self.model_alias)
+        voc_class = self.task_resource.get_model_class(voc_name)
+        voc_inference_class = self.task_resource.get_model_class(voc_name +
+                                                                 '_inference')
         if voc_name != 'wavernn':
             voc = voc_class(**self.voc_config["generator_params"])
             voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
@@ -444,7 +446,7 @@ class TTSExecutor(BaseExecutor):
         if not args.verbose:
             self.disable_task_loggers()
 
-        task_source = self.get_task_source(args.input)
+        task_source = self.get_input_source(args.input)
         task_results = OrderedDict()
         has_exceptions = False
 
diff --git a/paddlespeech/cli/tts/pretrained_models.py b/paddlespeech/cli/tts/pretrained_models.py
deleted file mode 100644
index 65254a935..000000000
--- a/paddlespeech/cli/tts/pretrained_models.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    # speedyspeech
-    "speedyspeech_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '6f6fa967b408454b6662c8c00c0027cb',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'tones_dict':
-        'tone_id_map.txt',
-    },
-
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
-        'md5':
-        '637d28a5e53aa60275612ba4393d5f22',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_76000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
-        'md5':
-        'ffed800c93deaf16ca9b3af89bfcd747',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_100000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
-        'md5':
-        'f4dd4a5f49a4552b77981f544ab3392e',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_96400.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    "fastspeech2_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
-        'md5':
-        '743e5024ca1e17a88c5c271db9779ba4',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_66200.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    # tacotron2
-    "tacotron2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '0df4b6f0bcbe0d73c5ed6df8867ab91a',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "tacotron2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
-        'md5':
-        '6a5eddd81ae0e81d16959b97481135f3',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_60300.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-
-    # pwgan
-    "pwgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
-        'md5':
-        '2e481633325b5bdf0a3823c714d2c117',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
-        'md5':
-        '53610ba9708fd3008ccaf8e99dacbaf0',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
-        'md5':
-        'd7598fa41ad362d62f85ffc0f07e3d84',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "pwgan_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
-        'md5':
-        'b3da1defcde3e578be71eb284cb89f2c',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'ee5f0604e20091f0d495b6ec4618b90d',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # style_melgan
-    "style_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        '5de2d5348f396de0c966926b8c462755',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # hifigan
-    "hifigan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'dd40a3d88dfcf64513fba2f0f961ada6',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "hifigan_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
-        'md5':
-        '70e9131695decbca06a65fe51ed38a72',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "hifigan_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
-        'md5':
-        '3bb49bc75032ed12f79c00c8cc79a09a',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "hifigan_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
-        'md5':
-        '7da8f88359bca2457e705d924cf27bd4',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-
-    # wavernn
-    "wavernn_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        'ee37b752f09bcba8f2af3b777ca38e13',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_400000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    }
-}
-
-model_alias = {
-    # acoustic model
-    "speedyspeech":
-    "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
-    "speedyspeech_inference":
-    "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
-    "fastspeech2":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    "tacotron2":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2",
-    "tacotron2_inference":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
-    # voc
-    "pwgan":
-    "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
-    "pwgan_inference":
-    "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
-    "mb_melgan":
-    "paddlespeech.t2s.models.melgan:MelGANGenerator",
-    "mb_melgan_inference":
-    "paddlespeech.t2s.models.melgan:MelGANInference",
-    "style_melgan":
-    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
-    "style_melgan_inference":
-    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
-    "hifigan":
-    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
-    "hifigan_inference":
-    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
-    "wavernn":
-    "paddlespeech.t2s.models.wavernn:WaveRNN",
-    "wavernn_inference":
-    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
-}
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 07fb73a4c..8bf090013 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -22,26 +22,20 @@ from typing import Union
 
 import paddle
 import soundfile
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from yacs.config import CfgNode
 
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from .pretrained_models import model_alias
-from .pretrained_models import pretrained_models
-from paddlespeech.utils.dynamic_import import dynamic_import
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 
 class VectorExecutor(BaseExecutor):
     def __init__(self):
-        super().__init__()
-        self.model_alias = model_alias
-        self.pretrained_models = pretrained_models
-
+        super().__init__('vector')
         self.parser = argparse.ArgumentParser(
             prog="paddlespeech.vector", add_help=True)
 
@@ -49,7 +43,10 @@ class VectorExecutor(BaseExecutor):
             "--model",
             type=str,
             default="ecapatdnn_voxceleb12",
-            choices=["ecapatdnn_voxceleb12"],
+            choices=[
+                tag[:tag.index('-')]
+                for tag in self.task_resource.pretrained_models.keys()
+            ],
             help="Choose model type of vector task.")
         self.parser.add_argument(
             "--task",
@@ -119,7 +116,7 @@ class VectorExecutor(BaseExecutor):
             self.disable_task_loggers()
 
         # stage 2: read the input data and store them as a list
-        task_source = self.get_task_source(parser_args.input)
+        task_source = self.get_input_source(parser_args.input)
         logger.info(f"task source: {task_source}")
 
         # stage 3: process the audio one by one
@@ -296,6 +293,7 @@ class VectorExecutor(BaseExecutor):
             # get the mode from pretrained list
             sample_rate_str = "16k" if sample_rate == 16000 else "8k"
             tag = model_type + "-" + sample_rate_str
+            self.task_resource.set_task_model(tag, version=None)
             logger.info(f"load the pretrained model: {tag}")
             # get the model from the pretrained list
             # we download the pretrained model and store it in the res_path
@@ -303,10 +301,11 @@ class VectorExecutor(BaseExecutor):
             self.res_path = res_path
 
             self.cfg_path = os.path.join(
-                res_path, self.pretrained_models[tag]['cfg_path'])
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['cfg_path'])
             self.ckpt_path = os.path.join(
-                res_path,
-                self.pretrained_models[tag]['ckpt_path'] + '.pdparams')
+                self.task_resource.res_dir,
+                self.task_resource.res_dict['ckpt_path'] + '.pdparams')
         else:
             # get the model from disk
             self.cfg_path = os.path.abspath(cfg_path)
@@ -325,8 +324,8 @@ class VectorExecutor(BaseExecutor):
         # stage 3: get the model name to instance the model network with dynamic_import
         logger.info("start to dynamic import the model class")
         model_name = model_type[:model_type.rindex('_')]
+        model_class = self.task_resource.get_model_class(model_name)
         logger.info(f"model name {model_name}")
-        model_class = dynamic_import(model_name, self.model_alias)
         model_conf = self.config.model
         backbone = model_class(**model_conf)
         model = SpeakerIdetification(
diff --git a/paddlespeech/cli/vector/pretrained_models.py b/paddlespeech/cli/vector/pretrained_models.py
deleted file mode 100644
index 4d1d3a048..000000000
--- a/paddlespeech/cli/vector/pretrained_models.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "ecapatdnn_voxceleb12-16k".
-    # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
-    "ecapatdnn_voxceleb12-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_1.tar.gz',
-        'md5':
-        '67c7ff8885d5246bd16e0f5ac1cba99f',
-        'cfg_path':
-        'conf/model.yaml',  # the yaml config path
-        'ckpt_path':
-        'model/model',  # the format is ${dir}/{model_name}, 
-        # so the first 'model' is dir, the second 'model' is the name
-        # this means we have a model stored as model/model.pdparams
-    },
-}
-
-model_alias = {
-    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
-}
diff --git a/paddlespeech/cli/stats/__init__.py b/paddlespeech/resource/__init__.py
similarity index 83%
rename from paddlespeech/cli/stats/__init__.py
rename to paddlespeech/resource/__init__.py
index 9fe6c4aba..e143413af 100644
--- a/paddlespeech/cli/stats/__init__.py
+++ b/paddlespeech/resource/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .infer import StatsExecutor
+from .resource import CommonTaskResource
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
new file mode 100644
index 000000000..9441a2805
--- /dev/null
+++ b/paddlespeech/resource/pretrained_models.py
@@ -0,0 +1,822 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'asr_dynamic_pretrained_models',
+    'asr_static_pretrained_models',
+    'cls_dynamic_pretrained_models',
+    'cls_static_pretrained_models',
+    'st_dynamic_pretrained_models',
+    'st_kaldi_bins',
+    'text_dynamic_pretrained_models',
+    'tts_dynamic_pretrained_models',
+    'tts_static_pretrained_models',
+    'tts_onnx_pretrained_models',
+    'vector_dynamic_pretrained_models',
+]
+
+# Tags in the pretrained-model tables have the form "{model_name}[_{dataset}][-{lang}][-...]",
+# e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
+# The command line and Python API take "{model_name}[_{dataset}]" as --model, for example:
+# "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
+
+# ---------------------------------
+# -------------- ASR --------------
+# ---------------------------------
+asr_dynamic_pretrained_models = {
+    "conformer_wenetspeech-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
+            'md5':
+            '76cb19ed857e6623856b7cd7ebbfeda4',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/conformer/checkpoints/wenetspeech',
+        },
+    },
+    "conformer_online_wenetspeech-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+            'md5':
+            'b8c02632b04da34aca88459835be54a6',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/chunk_conformer/checkpoints/avg_10',
+        },
+    },
+    "conformer_online_multicn-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.0.model.tar.gz',
+            'md5':
+            '7989b3248c898070904cf042fd656003',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/chunk_conformer/checkpoints/multi_cn',
+        },
+        '2.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
+            'md5':
+            '0ac93d390552336f2a906aec9e33c5fa',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/chunk_conformer/checkpoints/multi_cn',
+            'lm_url':
+            'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+            'lm_md5':
+            '29e02312deb2e59b3c8686c7966d4fe3'
+        },
+    },
+    "conformer_aishell-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz',
+            'md5':
+            '3f073eccfa7bb14e0c6867d65fc0dc3a',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/conformer/checkpoints/avg_30',
+        },
+    },
+    "conformer_online_aishell-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz',
+            'md5':
+            'b374cfb93537761270b6224fb0bfc26a',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/chunk_conformer/checkpoints/avg_30',
+        },
+    },
+    "transformer_librispeech-en-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
+            'md5':
+            '2c667da24922aad391eacafe37bc1660',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/transformer/checkpoints/avg_10',
+        },
+    },
+    "deepspeech2online_wenetspeech-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+            'md5':
+            'e393d4d274af0f6967db24fc146e8074',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/deepspeech2_online/checkpoints/avg_10',
+            'lm_url':
+            'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+            'lm_md5':
+            '29e02312deb2e59b3c8686c7966d4fe3'
+        },
+    },
+    "deepspeech2offline_aishell-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'md5':
+            '932c3593d62fe5c741b59b31318aa314',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/deepspeech2/checkpoints/avg_1',
+            'lm_url':
+            'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+            'lm_md5':
+            '29e02312deb2e59b3c8686c7966d4fe3'
+        },
+    },
+    "deepspeech2online_aishell-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
+            'md5':
+            '98b87b171b7240b7cae6e07d8d0bc9be',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/deepspeech2_online/checkpoints/avg_1',
+            'lm_url':
+            'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+            'lm_md5':
+            '29e02312deb2e59b3c8686c7966d4fe3'
+        },
+    },
+    "deepspeech2offline_librispeech-en-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+            'md5':
+            'f5666c81ad015c8de03aac2bc92e5762',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/deepspeech2/checkpoints/avg_1',
+            'lm_url':
+            'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
+            'lm_md5':
+            '099a601759d467cd0a8523ff939819c5'
+        },
+    },
+}
+
+asr_static_pretrained_models = {
+    "deepspeech2offline_aishell-zh-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'md5':
+            '932c3593d62fe5c741b59b31318aa314',
+            'cfg_path':
+            'model.yaml',
+            'ckpt_path':
+            'exp/deepspeech2/checkpoints/avg_1',
+            'model':
+            'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
+            'params':
+            'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
+            'lm_url':
+            'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+            'lm_md5':
+            '29e02312deb2e59b3c8686c7966d4fe3'
+        }
+    },
+}
+
+# ---------------------------------
+# -------------- CLS --------------
+# ---------------------------------
+cls_dynamic_pretrained_models = {
+    "panns_cnn6-32k": {
+        '1.0': {
+            'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
+            'md5': '4cf09194a95df024fd12f84712cf0f9c',
+            'cfg_path': 'panns.yaml',
+            'ckpt_path': 'cnn6.pdparams',
+            'label_file': 'audioset_labels.txt',
+        },
+    },
+    "panns_cnn10-32k": {
+        '1.0': {
+            'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
+            'md5': 'cb8427b22176cc2116367d14847f5413',
+            'cfg_path': 'panns.yaml',
+            'ckpt_path': 'cnn10.pdparams',
+            'label_file': 'audioset_labels.txt',
+        },
+    },
+    "panns_cnn14-32k": {
+        '1.0': {
+            'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
+            'md5': 'e3b9b5614a1595001161d0ab95edee97',
+            'cfg_path': 'panns.yaml',
+            'ckpt_path': 'cnn14.pdparams',
+            'label_file': 'audioset_labels.txt',
+        },
+    },
+}
+
+cls_static_pretrained_models = {
+    "panns_cnn6-32k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
+            'md5':
+            'da087c31046d23281d8ec5188c1967da',
+            'cfg_path':
+            'panns.yaml',
+            'model_path':
+            'inference.pdmodel',
+            'params_path':
+            'inference.pdiparams',
+            'label_file':
+            'audioset_labels.txt',
+        },
+    },
+    "panns_cnn10-32k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
+            'md5':
+            '5460cc6eafbfaf0f261cc75b90284ae1',
+            'cfg_path':
+            'panns.yaml',
+            'model_path':
+            'inference.pdmodel',
+            'params_path':
+            'inference.pdiparams',
+            'label_file':
+            'audioset_labels.txt',
+        },
+    },
+    "panns_cnn14-32k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
+            'md5':
+            'ccc80b194821274da79466862b2ab00f',
+            'cfg_path':
+            'panns.yaml',
+            'model_path':
+            'inference.pdmodel',
+            'params_path':
+            'inference.pdiparams',
+            'label_file':
+            'audioset_labels.txt',
+        },
+    },
+}
+
+# ---------------------------------
+# -------------- ST ---------------
+# ---------------------------------
+st_dynamic_pretrained_models = {
+    "fat_st_ted-en-zh": {
+        '1.0': {
+            "url":
+            "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
+            "md5":
+            "d62063f35a16d91210a71081bd2dd557",
+            "cfg_path":
+            "model.yaml",
+            "ckpt_path":
+            "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
+        },
+    },
+}
+
+st_kaldi_bins = {
+    "url":
+    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
+    "md5":
+    "c0682303b3f3393dbf6ed4c4e35a53eb",
+}
+
+# ---------------------------------
+# -------------- TEXT -------------
+# ---------------------------------
+text_dynamic_pretrained_models = {
+    "ernie_linear_p7_wudao-punc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
+            'md5':
+            '12283e2ddde1797c5d1e57036b512746',
+            'cfg_path':
+            'ckpt/model_config.json',
+            'ckpt_path':
+            'ckpt/model_state.pdparams',
+            'vocab_file':
+            'punc_vocab.txt',
+        },
+    },
+    "ernie_linear_p3_wudao-punc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
+            'md5':
+            '448eb2fdf85b6a997e7e652e80c51dd2',
+            'cfg_path':
+            'ckpt/model_config.json',
+            'ckpt_path':
+            'ckpt/model_state.pdparams',
+            'vocab_file':
+            'punc_vocab.txt',
+        },
+    },
+}
+
+# ---------------------------------
+# -------------- TTS --------------
+# ---------------------------------
+tts_dynamic_pretrained_models = {
+    # speedyspeech
+    "speedyspeech_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
+            'md5':
+            '6f6fa967b408454b6662c8c00c0027cb',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_30600.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'tones_dict':
+            'tone_id_map.txt',
+        },
+    },
+    # fastspeech2
+    "fastspeech2_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
+            'md5':
+            '637d28a5e53aa60275612ba4393d5f22',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_76000.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+        },
+    },
+    "fastspeech2_ljspeech-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
+            'md5':
+            'ffed800c93deaf16ca9b3af89bfcd747',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_100000.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+        },
+    },
+    "fastspeech2_aishell3-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
+            'md5':
+            'f4dd4a5f49a4552b77981f544ab3392e',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_96400.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+        },
+    },
+    "fastspeech2_vctk-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
+            'md5':
+            '743e5024ca1e17a88c5c271db9779ba4',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_66200.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+        },
+    },
+    # tacotron2
+    "tacotron2_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
+            'md5':
+            '0df4b6f0bcbe0d73c5ed6df8867ab91a',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_30600.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+        },
+    },
+    "tacotron2_ljspeech-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
+            'md5':
+            '6a5eddd81ae0e81d16959b97481135f3',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_60300.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+        },
+    },
+    # pwgan
+    "pwgan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
+            'md5':
+            '2e481633325b5bdf0a3823c714d2c117',
+            'config':
+            'pwg_default.yaml',
+            'ckpt':
+            'pwg_snapshot_iter_400000.pdz',
+            'speech_stats':
+            'pwg_stats.npy',
+        },
+    },
+    "pwgan_ljspeech-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
+            'md5':
+            '53610ba9708fd3008ccaf8e99dacbaf0',
+            'config':
+            'pwg_default.yaml',
+            'ckpt':
+            'pwg_snapshot_iter_400000.pdz',
+            'speech_stats':
+            'pwg_stats.npy',
+        },
+    },
+    "pwgan_aishell3-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
+            'md5':
+            'd7598fa41ad362d62f85ffc0f07e3d84',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_1000000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    "pwgan_vctk-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
+            'md5':
+            'b3da1defcde3e578be71eb284cb89f2c',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_1500000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    # mb_melgan
+    "mb_melgan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
+            'md5':
+            'ee5f0604e20091f0d495b6ec4618b90d',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_1000000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    # style_melgan
+    "style_melgan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
+            'md5':
+            '5de2d5348f396de0c966926b8c462755',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_1500000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    # hifigan
+    "hifigan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
+            'md5':
+            'dd40a3d88dfcf64513fba2f0f961ada6',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_2500000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    "hifigan_ljspeech-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
+            'md5':
+            '70e9131695decbca06a65fe51ed38a72',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_2500000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    "hifigan_aishell3-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
+            'md5':
+            '3bb49bc75032ed12f79c00c8cc79a09a',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_2500000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    "hifigan_vctk-en": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
+            'md5':
+            '7da8f88359bca2457e705d924cf27bd4',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_2500000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    # wavernn
+    "wavernn_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
+            'md5':
+            'ee37b752f09bcba8f2af3b777ca38e13',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_400000.pdz',
+            'speech_stats':
+            'feats_stats.npy',
+        },
+    },
+    "fastspeech2_cnndecoder_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
+            'md5':
+            '6eb28e22ace73e0ebe7845f86478f89f',
+            'config':
+            'cnndecoder.yaml',
+            'ckpt':
+            'snapshot_iter_153000.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+        },
+    },
+}
+
+tts_static_pretrained_models = {
+    # speedyspeech
+    "speedyspeech_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
+            'md5':
+            'f10cbdedf47dc7a9668d2264494e1823',
+            'model':
+            'speedyspeech_csmsc.pdmodel',
+            'params':
+            'speedyspeech_csmsc.pdiparams',
+            'phones_dict':
+            'phone_id_map.txt',
+            'tones_dict':
+            'tone_id_map.txt',
+            'sample_rate':
+            24000,
+        },
+    },
+    # fastspeech2
+    "fastspeech2_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
+            'md5':
+            '9788cd9745e14c7a5d12d32670b2a5a7',
+            'model':
+            'fastspeech2_csmsc.pdmodel',
+            'params':
+            'fastspeech2_csmsc.pdiparams',
+            'phones_dict':
+            'phone_id_map.txt',
+            'sample_rate':
+            24000,
+        },
+    },
+    # pwgan
+    "pwgan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
+            'md5':
+            'e3504aed9c5a290be12d1347836d2742',
+            'model':
+            'pwgan_csmsc.pdmodel',
+            'params':
+            'pwgan_csmsc.pdiparams',
+            'sample_rate':
+            24000,
+        },
+    },
+    # mb_melgan
+    "mb_melgan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
+            'md5':
+            'ac6eee94ba483421d750433f4c3b8d36',
+            'model':
+            'mb_melgan_csmsc.pdmodel',
+            'params':
+            'mb_melgan_csmsc.pdiparams',
+            'sample_rate':
+            24000,
+        },
+    },
+    # hifigan
+    "hifigan_csmsc-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
+            'md5':
+            '7edd8c436b3a5546b3a7cb8cff9d5a0c',
+            'model':
+            'hifigan_csmsc.pdmodel',
+            'params':
+            'hifigan_csmsc.pdiparams',
+            'sample_rate':
+            24000,
+        },
+    },
+}
+
+tts_onnx_pretrained_models = {
+    # fastspeech2
+    "fastspeech2_csmsc_onnx-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
+            'md5':
+            'fd3ad38d83273ad51f0ea4f4abf3ab4e',
+            'ckpt': ['fastspeech2_csmsc.onnx'],
+            'phones_dict':
+            'phone_id_map.txt',
+            'sample_rate':
+            24000,
+        },
+    },
+    "fastspeech2_cnndecoder_csmsc_onnx-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
+            'md5':
+            '5f70e1a6bcd29d72d54e7931aa86f266',
+            'ckpt': [
+                'fastspeech2_csmsc_am_encoder_infer.onnx',
+                'fastspeech2_csmsc_am_decoder.onnx',
+                'fastspeech2_csmsc_am_postnet.onnx',
+            ],
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'sample_rate':
+            24000,
+        },
+    },
+    # mb_melgan
+    "mb_melgan_csmsc_onnx-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
+            'md5':
+            '5b83ec746e8414bc29032d954ffd07ec',
+            'ckpt':
+            'mb_melgan_csmsc.onnx',
+            'sample_rate':
+            24000,
+        },
+    },
+    # hifigan
+    "hifigan_csmsc_onnx-zh": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
+            'md5':
+            '1a7dc0385875889e46952e50c0994a6b',
+            'ckpt':
+            'hifigan_csmsc.onnx',
+            'sample_rate':
+            24000,
+        },
+    },
+}
+
+# ---------------------------------
+# ------------ Vector -------------
+# ---------------------------------
+vector_dynamic_pretrained_models = {
+    "ecapatdnn_voxceleb12-16k": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
+            'md5':
+            'cc33023c54ab346cd318408f43fcaf95',
+            'cfg_path':
+            'conf/model.yaml',  # the yaml config path
+            'ckpt_path':
+            'model/model',  # the format is ${dir}/{model_name}, 
+            # i.e. the first 'model' is the directory, the second is the file stem,
+            # so the checkpoint is stored at model/model.pdparams
+        },
+    },
+}
diff --git a/paddlespeech/resource/resource.py b/paddlespeech/resource/resource.py
new file mode 100644
index 000000000..f00b1b3b0
--- /dev/null
+++ b/paddlespeech/resource/resource.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from collections import OrderedDict
+from typing import Dict
+from typing import List
+from typing import Optional
+
+from ..cli.utils import download_and_decompress
+from ..cli.utils import MODEL_HOME
+from ..utils.dynamic_import import dynamic_import
+from .model_alias import model_alias
+
+task_supported = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
+model_format_supported = ['dynamic', 'static', 'onnx']
+inference_mode_supported = ['online', 'offline']
+
+
+class CommonTaskResource:
+    def __init__(self, task: str, model_format: str='dynamic', **kwargs):
+        """Validate the arguments and load the model table of the task.
+
+        Args:
+            task (str): One of `task_supported`.
+            model_format (str): One of `model_format_supported`.
+            **kwargs: Optional 'inference_mode' used to filter the table.
+        """
+        assert task in task_supported, \
+            'Arg "task" must be one of {}.'.format(task_supported)
+        assert model_format in model_format_supported, \
+            'Arg "model_format" must be one of {}.'.format(model_format_supported)
+
+        self.task = task
+        self.model_format = model_format
+        self.pretrained_models = self._get_pretrained_models()
+
+        if 'inference_mode' in kwargs:
+            mode = kwargs['inference_mode']
+            assert mode in inference_mode_supported, \
+                'Arg "inference_mode" must be one of {}.'.format(inference_mode_supported)
+            self._inference_mode_filter(mode)
+
+        # Filled in later by set_task_model(); vocoder slots exist for tts only.
+        self.model_tag = self.version = self.res_dict = self.res_dir = None
+        if self.task == 'tts':
+            self.voc_model_tag = self.voc_version = None
+            self.voc_res_dict = self.voc_res_dir = None
+
+    def set_task_model(self,
+                       model_tag: str,
+                       model_type: int=0,
+                       version: Optional[str]=None):
+        """Set model tag and version of current task.
+
+        Args:
+            model_tag (str): Model tag.
+            model_type (int): 0 for acoustic model otherwise vocoder in tts task.
+            version (Optional[str], optional): Version of pretrained model. Defaults to None.
+
+        Raises:
+            AssertionError: Unknown tag or version, or a vocoder outside tts.
+        """
+        assert model_tag in self.pretrained_models, \
+            "Can't find \"{}\" in resource. Model name must be one of {}".format(model_tag, list(self.pretrained_models.keys()))
+
+        if version is None:
+            version = self._get_default_version(model_tag)
+
+        assert version in self.pretrained_models[model_tag], \
+            "Can't find version \"{}\" in \"{}\". Version must be one of {}".format(
+                version, model_tag, list(self.pretrained_models[model_tag].keys()))
+
+        res_dict = self.pretrained_models[model_tag][version]
+        if model_type == 0:
+            self.model_tag, self.version = model_tag, version
+            self.res_dict = res_dict
+            self.res_dir = self._fetch(res_dict, self._get_model_dir(model_type))
+        else:
+            assert self.task == 'tts', 'Vocoder will only be used in tts task.'
+            self.voc_model_tag, self.voc_version = model_tag, version
+            self.voc_res_dict = res_dict
+            self.voc_res_dir = self._fetch(res_dict, self._get_model_dir(model_type))
+
+    @staticmethod
+    def get_model_class(model_name) -> List[object]:
+        """Dynamically import the class(es) registered for a model name.
+
+        Args:
+            model_name (str): Model name, a key of `model_alias`.
+
+        Returns:
+            List[object]: One imported object per path in the alias entry;
+                a single-path entry yields that object itself, not a list.
+
+        Raises:
+            AssertionError: If `model_name` is not in `model_alias`.
+        """
+        assert model_name in model_alias, 'No model classes found for "{}"'.format(
+            model_name)
+
+        imported = [dynamic_import(path) for path in model_alias[model_name]]
+        # Exactly one entry is handed back bare instead of wrapped in a list.
+        return imported[0] if len(imported) == 1 else imported
+
+    def get_versions(self, model_tag: str) -> List[str]:
+        """Return every version key registered for a model.
+
+        Args:
+            model_tag (str): Model tag, a key of `self.pretrained_models`.
+
+        Returns:
+            List[str]: Version keys in the order the model table stores them.
+        """
+        return [version for version in self.pretrained_models[model_tag]]
+
+    def _get_default_version(self, model_tag: str) -> str:
+        """Pick the version used when the caller does not name one.
+
+        Args:
+            model_tag (str): Model tag.
+
+        Returns:
+            str: The last entry of `get_versions(model_tag)`.
+        """
+        return self.get_versions(model_tag).pop()  # last listed == latest
+
+    def _get_model_dir(self, model_type: int=0) -> os.PathLike:
+        """Build the local directory of the currently selected model.
+
+        Args:
+            model_type (int): 0 for acoustic model otherwise vocoder in tts task.
+
+        Returns:
+            os.PathLike: Directory of model resource.
+        """
+        if model_type == 0:
+            parts = (self.model_tag, self.version)
+        else:
+            # Any non-zero model_type selects the vocoder attributes.
+            parts = (self.voc_model_tag, self.voc_version)
+
+        # The path is MODEL_HOME/<model tag>/<version>.
+        return os.path.join(MODEL_HOME, *parts)
+
+    def _get_pretrained_models(self) -> Dict[str, str]:
+        """Get all available models for current task.
+
+        Returns:
+            Dict[str, str]: A dictionary with model tag and resources info,
+                empty when no table exists for this task and model format.
+        """
+        # Plain import + getattr replaces exec()/locals(), which stops working
+        # on Python 3.13 (PEP 667); nothing is swallowed by return-in-finally.
+        table = '{}_{}_pretrained_models'.format(self.task, self.model_format)
+        try:
+            from . import pretrained_models
+        except ImportError:
+            return OrderedDict()  # no models.
+        return OrderedDict(getattr(pretrained_models, table, {}))
+
+    def _inference_mode_filter(self, inference_mode: Optional[str]):
+        """Filter models dict based on inference_mode.
+
+        Args:
+            inference_mode (Optional[str]): 'online', 'offline' or None.
+
+        Raises:
+            NotImplementedError: Non-None mode on a task other than asr/tts.
+        """
+        if inference_mode is None:
+            return
+
+        if self.task == 'asr':
+            # An asr tag counts as an online model iff it contains 'online'.
+            def keep(model_tag: str) -> bool:
+                if 'online' in model_tag:
+                    return inference_mode == 'online'
+                return inference_mode == 'offline'
+        elif self.task == 'tts':
+            # Hardcode for tts online models.
+            tts_online_models = [
+                'fastspeech2_csmsc-zh', 'fastspeech2_cnndecoder_csmsc-zh',
+                'mb_melgan_csmsc-zh', 'hifigan_csmsc-zh'
+            ]
+
+            def keep(model_tag: str) -> bool:
+                if inference_mode == 'offline':
+                    return True
+                return (inference_mode == 'online' and
+                        model_tag in tts_online_models)
+        else:
+            raise NotImplementedError('Only supports asr and tts task.')
+
+        # Snapshot the keys first: entries are deleted while walking them.
+        for model_tag in list(self.pretrained_models.keys()):
+            if not keep(model_tag):
+                del self.pretrained_models[model_tag]
+
+    @staticmethod
+    def _fetch(res_dict: Dict[str, str],
+               target_dir: os.PathLike) -> os.PathLike:
+        """Obtain a resource by delegating to `download_and_decompress`.
+
+        Args:
+            res_dict (Dict[str, str]): Info dict of a resource.
+            target_dir (os.PathLike): Directory handed to the downloader.
+
+        Returns:
+            os.PathLike: Directory of model resource.
+        """
+        return download_and_decompress(res_dict, target_dir)
diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py
index e59f17d38..f1c6b4f89 100644
--- a/paddlespeech/server/bin/paddlespeech_server.py
+++ b/paddlespeech/server/bin/paddlespeech_server.py
@@ -25,6 +25,7 @@ from ..executor import BaseExecutor
 from ..util import cli_server_register
 from ..util import stats_wrapper
 from paddlespeech.cli.log import logger
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.server.engine.engine_pool import init_engine_pool
 from paddlespeech.server.restful.api import setup_router as setup_http_router
 from paddlespeech.server.utils.config import get_config
@@ -152,101 +153,30 @@ class ServerStatsExecutor():
                 "Please input correct speech task, choices = ['asr', 'tts']")
             return False
 
-        elif self.task.lower() == 'asr':
-            try:
-                from paddlespeech.cli.asr.infer import pretrained_models
-                logger.info(
-                    "Here is the table of ASR pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
-
-                # show ASR static pretrained model
-                from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models
-                logger.info(
-                    "Here is the table of ASR static pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
-
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the table of ASR pretrained models supported in the service."
-                )
-                return False
-
-        elif self.task.lower() == 'tts':
-            try:
-                from paddlespeech.cli.tts.infer import pretrained_models
-                logger.info(
-                    "Here is the table of TTS pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
-
-                # show TTS static pretrained model
-                from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models
-                logger.info(
-                    "Here is the table of TTS static pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
-
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the table of TTS pretrained models supported in the service."
-                )
-                return False
+        try:
+            # Dynamic models
+            dynamic_pretrained_models = CommonTaskResource(
+                task=self.task, model_format='dynamic').pretrained_models
 
-        elif self.task.lower() == 'cls':
-            try:
-                from paddlespeech.cli.cls.infer import pretrained_models
+            if len(dynamic_pretrained_models) > 0:
                 logger.info(
-                    "Here is the table of CLS pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
-
-                # show CLS static pretrained model
-                from paddlespeech.server.engine.cls.paddleinference.cls_engine import pretrained_models
+                    "Here is the table of {} pretrained models supported in the service.".
+                    format(self.task.upper()))
+                self.show_support_models(dynamic_pretrained_models)
+
+            # Static models
+            static_pretrained_models = CommonTaskResource(
+                task=self.task, model_format='static').pretrained_models
+            if len(static_pretrained_models) > 0:
                 logger.info(
-                    "Here is the table of CLS static pretrained models supported in the service."
-                )
+                    "Here is the table of {} static pretrained models supported in the service.".
+                    format(self.task.upper()))
                 self.show_support_models(pretrained_models)
 
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the table of CLS pretrained models supported in the service."
-                )
-                return False
-        elif self.task.lower() == 'text':
-            try:
-                from paddlespeech.cli.text.infer import pretrained_models
-                logger.info(
-                    "Here is the table of Text pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
+            return True
 
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the table of Text pretrained models supported in the service."
-                )
-                return False
-        elif self.task.lower() == 'vector':
-            try:
-                from paddlespeech.cli.vector.infer import pretrained_models
-                logger.info(
-                    "Here is the table of Vector pretrained models supported in the service."
-                )
-                self.show_support_models(pretrained_models)
-
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the table of Vector pretrained models supported in the service."
-                )
-                return False
-        else:
+        except BaseException:
             logger.error(
-                f"Failed to get the table of {self.task} pretrained models supported in the service."
-            )
+                "Failed to get the table of {} pretrained models supported in the service.".
+                format(self.task.upper()))
             return False
diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py
index d7bd458f8..14715bf35 100644
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import os
 import sys
 from typing import Optional
@@ -21,15 +20,14 @@ import paddle
 from numpy import float32
 from yacs.config import CfgNode
 
-from .pretrained_models import pretrained_models
 from paddlespeech.cli.asr.infer import ASRExecutor
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.speech import SpeechSegment
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.transform.transformation import Transformation
-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
 from paddlespeech.s2t.utils.tensor_utils import pad_sequence
 from paddlespeech.s2t.utils.utility import UpdateConfig
@@ -53,7 +51,7 @@ class PaddleASRConnectionHanddler:
         logger.info(
             "create an paddle asr connection handler to process the websocket connection"
         )
-        self.config = asr_engine.config # server config
+        self.config = asr_engine.config  # server config
         self.model_config = asr_engine.executor.config
         self.asr_engine = asr_engine
 
@@ -251,10 +249,12 @@ class PaddleASRConnectionHanddler:
             # for deepspeech2 
             # init state
             self.chunk_state_h_box = np.zeros(
-                (self.model_config .num_rnn_layers, 1, self.model_config.rnn_layer_size),
+                (self.model_config.num_rnn_layers, 1,
+                 self.model_config.rnn_layer_size),
                 dtype=float32)
             self.chunk_state_c_box = np.zeros(
-                (self.model_config.num_rnn_layers, 1, self.model_config.rnn_layer_size),
+                (self.model_config.num_rnn_layers, 1,
+                 self.model_config.rnn_layer_size),
                 dtype=float32)
             self.decoder.reset_decoder(batch_size=1)
 
@@ -699,7 +699,8 @@ class PaddleASRConnectionHanddler:
 class ASRServerExecutor(ASRExecutor):
     def __init__(self):
         super().__init__()
-        self.pretrained_models = pretrained_models
+        self.task_resource = CommonTaskResource(
+            task='asr', model_format='dynamic', inference_mode='online')
 
     def _init_from_path(self,
                         model_type: str=None,
@@ -723,20 +724,19 @@ class ASRServerExecutor(ASRExecutor):
         self.sample_rate = sample_rate
         sample_rate_str = '16k' if sample_rate == 16000 else '8k'
         tag = model_type + '-' + lang + '-' + sample_rate_str
-
+        self.task_resource.set_task_model(model_tag=tag)
         if cfg_path is None or am_model is None or am_params is None:
             logger.info(f"Load the pretrained model, tag = {tag}")
-            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
-            self.res_path = res_path
+            self.res_path = self.task_resource.res_dir
 
             self.cfg_path = os.path.join(
-                res_path, self.pretrained_models[tag]['cfg_path'])
+                self.res_path, self.task_resource.res_dict['cfg_path'])
 
-            self.am_model = os.path.join(res_path,
-                                         self.pretrained_models[tag]['model'])
-            self.am_params = os.path.join(res_path,
-                                          self.pretrained_models[tag]['params'])
-            logger.info(res_path)
+            self.am_model = os.path.join(self.res_path,
+                                         self.task_resource.res_dict['model'])
+            self.am_params = os.path.join(self.res_path,
+                                          self.task_resource.res_dict['params'])
+            logger.info(self.res_path)
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.am_model = os.path.abspath(am_model)
@@ -763,8 +763,8 @@ class ASRServerExecutor(ASRExecutor):
                 self.text_feature = TextFeaturizer(
                     unit_type=self.config.unit_type, vocab=self.vocab)
 
-                lm_url = self.pretrained_models[tag]['lm_url']
-                lm_md5 = self.pretrained_models[tag]['lm_md5']
+                lm_url = self.task_resource.res_dict['lm_url']
+                lm_md5 = self.task_resource.res_dict['lm_md5']
                 logger.info(f"Start to load language model {lm_url}")
                 self.download_lm(
                     lm_url,
@@ -810,7 +810,7 @@ class ASRServerExecutor(ASRExecutor):
             model_name = model_type[:model_type.rindex(
                 '_')]  # model_type: {model_name}_{dataset}
             logger.info(f"model name: {model_name}")
-            model_class = dynamic_import(model_name, self.model_alias)
+            model_class = self.task_resource.get_model_class(model_name)
             model_conf = self.config
             model = model_class.from_config(model_conf)
             self.model = model
@@ -824,7 +824,7 @@ class ASRServerExecutor(ASRExecutor):
             raise ValueError(f"Not support: {model_type}")
 
         return True
-        
+
 
 class ASREngine(BaseEngine):
     """ASR server resource
diff --git a/paddlespeech/server/engine/asr/online/pretrained_models.py b/paddlespeech/server/engine/asr/online/pretrained_models.py
deleted file mode 100644
index ff3778657..000000000
--- a/paddlespeech/server/engine/asr/online/pretrained_models.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    "deepspeech2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
-        'md5':
-        '98b87b171b7240b7cae6e07d8d0bc9be',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_1',
-        'model':
-        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
-        'params':
-        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "conformer_online_multicn-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
-        'md5':
-        '0ac93d390552336f2a906aec9e33c5fa',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/chunk_conformer/checkpoints/multi_cn',
-        'model':
-        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
-        'params':
-        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "conformer_online_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz',
-        'md5':
-        'b8c02632b04da34aca88459835be54a6',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/chunk_conformer/checkpoints/avg_10',
-        'model':
-        'exp/chunk_conformer/checkpoints/avg_10.pdparams',
-        'params':
-        'exp/chunk_conformer/checkpoints/avg_10.pdparams',
-        'lm_url':
-        '',
-        'lm_md5':
-        '',
-    },
-}
diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
index e275f1088..80e323fa0 100644
--- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
@@ -19,10 +19,10 @@ from typing import Optional
 import paddle
 from yacs.config import CfgNode
 
-from .pretrained_models import pretrained_models
 from paddlespeech.cli.asr.infer import ASRExecutor
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.utils.utility import UpdateConfig
@@ -36,7 +36,8 @@ __all__ = ['ASREngine']
 class ASRServerExecutor(ASRExecutor):
     def __init__(self):
         super().__init__()
-        self.pretrained_models = pretrained_models
+        self.task_resource = CommonTaskResource(
+            task='asr', model_format='static', inference_mode='online')
 
     def _init_from_path(self,
                         model_type: str='wenetspeech',
@@ -53,17 +54,17 @@ class ASRServerExecutor(ASRExecutor):
 
         sample_rate_str = '16k' if sample_rate == 16000 else '8k'
         tag = model_type + '-' + lang + '-' + sample_rate_str
+        self.task_resource.set_task_model(model_tag=tag)
         if cfg_path is None or am_model is None or am_params is None:
-            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
-            self.res_path = res_path
+            self.res_path = self.task_resource.res_dir
             self.cfg_path = os.path.join(
-                res_path, self.pretrained_models[tag]['cfg_path'])
+                self.res_path, self.task_resource.res_dict['cfg_path'])
 
-            self.am_model = os.path.join(res_path,
-                                         self.pretrained_models[tag]['model'])
-            self.am_params = os.path.join(res_path,
-                                          self.pretrained_models[tag]['params'])
-            logger.info(res_path)
+            self.am_model = os.path.join(self.res_path,
+                                         self.task_resource.res_dict['model'])
+            self.am_params = os.path.join(self.res_path,
+                                          self.task_resource.res_dict['params'])
+            logger.info(self.res_path)
             logger.info(self.cfg_path)
             logger.info(self.am_model)
             logger.info(self.am_params)
@@ -89,8 +90,8 @@ class ASRServerExecutor(ASRExecutor):
                 self.text_feature = TextFeaturizer(
                     unit_type=self.config.unit_type, vocab=self.vocab)
 
-                lm_url = self.pretrained_models[tag]['lm_url']
-                lm_md5 = self.pretrained_models[tag]['lm_md5']
+                lm_url = self.task_resource.res_dict['lm_url']
+                lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(
                     lm_url,
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)
diff --git a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
deleted file mode 100644
index c4c23e38c..000000000
--- a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    "deepspeech2offline_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '932c3593d62fe5c741b59b31318aa314',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'model':
-        'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
-        'params':
-        'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-}
diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
index 0906c2412..48792c883 100644
--- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
+++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
@@ -20,9 +20,9 @@ import numpy as np
 import paddle
 import yaml
 
-from .pretrained_models import pretrained_models
 from paddlespeech.cli.cls.infer import CLSExecutor
 from paddlespeech.cli.log import logger
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.paddle_predictor import init_predictor
 from paddlespeech.server.utils.paddle_predictor import run_model
@@ -33,11 +33,12 @@ __all__ = ['CLSEngine']
 class CLSServerExecutor(CLSExecutor):
     def __init__(self):
         super().__init__()
-        self.pretrained_models = pretrained_models
+        self.task_resource = CommonTaskResource(
+            task='cls', model_format='static')
 
     def _init_from_path(
             self,
-            model_type: str='panns_cnn14',
+            model_type: str='panns_cnn14_audioset',
             cfg_path: Optional[os.PathLike]=None,
             model_path: Optional[os.PathLike]=None,
             params_path: Optional[os.PathLike]=None,
@@ -49,15 +50,16 @@ class CLSServerExecutor(CLSExecutor):
 
         if cfg_path is None or model_path is None or params_path is None or label_file is None:
             tag = model_type + '-' + '32k'
-            self.res_path = self._get_pretrained_path(tag)
+            self.task_resource.set_task_model(model_tag=tag)
+            self.res_path = self.task_resource.res_dir
             self.cfg_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['cfg_path'])
+                self.res_path, self.task_resource.res_dict['cfg_path'])
             self.model_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['model_path'])
+                self.res_path, self.task_resource.res_dict['model_path'])
             self.params_path = os.path.join(
-                self.res_path, self.pretrained_models[tag]['params_path'])
+                self.res_path, self.task_resource.res_dict['params_path'])
             self.label_file = os.path.join(
-                self.res_path, self.pretrained_models[tag]['label_file'])
+                self.res_path, self.task_resource.res_dict['label_file'])
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.model_path = os.path.abspath(model_path)
diff --git a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
deleted file mode 100644
index e49148746..000000000
--- a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-pretrained_models = {
-    "panns_cnn6-32k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
-        'md5':
-        'da087c31046d23281d8ec5188c1967da',
-        'cfg_path':
-        'panns.yaml',
-        'model_path':
-        'inference.pdmodel',
-        'params_path':
-        'inference.pdiparams',
-        'label_file':
-        'audioset_labels.txt',
-    },
-    "panns_cnn10-32k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
-        'md5':
-        '5460cc6eafbfaf0f261cc75b90284ae1',
-        'cfg_path':
-        'panns.yaml',
-        'model_path':
-        'inference.pdmodel',
-        'params_path':
-        'inference.pdiparams',
-        'label_file':
-        'audioset_labels.txt',
-    },
-    "panns_cnn14-32k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
-        'md5':
-        'ccc80b194821274da79466862b2ab00f',
-        'cfg_path':
-        'panns.yaml',
-        'model_path':
-        'inference.pdmodel',
-        'params_path':
-        'inference.pdiparams',
-        'label_file':
-        'audioset_labels.txt',
-    },
-}
diff --git a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
deleted file mode 100644
index 789f5be7d..000000000
--- a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# support online model
-pretrained_models = {
-    # fastspeech2
-    "fastspeech2_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
-        'md5':
-        'fd3ad38d83273ad51f0ea4f4abf3ab4e',
-        'ckpt': ['fastspeech2_csmsc.onnx'],
-        'phones_dict':
-        'phone_id_map.txt',
-        'sample_rate':
-        24000,
-    },
-    "fastspeech2_cnndecoder_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
-        'md5':
-        '5f70e1a6bcd29d72d54e7931aa86f266',
-        'ckpt': [
-            'fastspeech2_csmsc_am_encoder_infer.onnx',
-            'fastspeech2_csmsc_am_decoder.onnx',
-            'fastspeech2_csmsc_am_postnet.onnx',
-        ],
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'sample_rate':
-        24000,
-    },
-
-    # mb_melgan
-    "mb_melgan_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
-        'md5':
-        '5b83ec746e8414bc29032d954ffd07ec',
-        'ckpt':
-        'mb_melgan_csmsc.onnx',
-        'sample_rate':
-        24000,
-    },
-
-    # hifigan
-    "hifigan_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
-        'md5':
-        '1a7dc0385875889e46952e50c0994a6b',
-        'ckpt':
-        'hifigan_csmsc.onnx',
-        'sample_rate':
-        24000,
-    },
-}
diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
index 792442065..6453f1ae7 100644
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -20,9 +20,9 @@ from typing import Optional
 import numpy as np
 import paddle
 
-from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
 from paddlespeech.server.utils.onnx_infer import get_sess
@@ -43,7 +43,7 @@ class TTSServerExecutor(TTSExecutor):
         self.voc_pad = voc_pad
         self.voc_upsample = voc_upsample
 
-        self.pretrained_models = pretrained_models
+        self.task_resource = CommonTaskResource(task='tts', model_format='onnx')
 
     def _init_from_path(
             self,
@@ -72,16 +72,21 @@ class TTSServerExecutor(TTSExecutor):
             return
         # am
         am_tag = am + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=am_tag,
+            model_type=0,  # am
+            version=None,  # default version
+        )
+        self.am_res_path = self.task_resource.res_dir
         if am == "fastspeech2_csmsc_onnx":
             # get model info
             if am_ckpt is None or phones_dict is None:
-                am_res_path = self._get_pretrained_path(am_tag)
-                self.am_res_path = am_res_path
                 self.am_ckpt = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
+                    self.am_res_path, self.task_resource.res_dict['ckpt'][0])
                 # must have phones_dict in acoustic
                 self.phones_dict = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+                    self.am_res_path,
+                    self.task_resource.res_dict['phones_dict'])
 
             else:
                 self.am_ckpt = os.path.abspath(am_ckpt[0])
@@ -94,19 +99,19 @@ class TTSServerExecutor(TTSExecutor):
 
         elif am == "fastspeech2_cnndecoder_csmsc_onnx":
             if am_ckpt is None or am_stat is None or phones_dict is None:
-                am_res_path = self._get_pretrained_path(am_tag)
-                self.am_res_path = am_res_path
                 self.am_encoder_infer = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
+                    self.am_res_path, self.task_resource.res_dict['ckpt'][0])
                 self.am_decoder = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['ckpt'][1])
+                    self.am_res_path, self.task_resource.res_dict['ckpt'][1])
                 self.am_postnet = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['ckpt'][2])
+                    self.am_res_path, self.task_resource.res_dict['ckpt'][2])
                 # must have phones_dict in acoustic
                 self.phones_dict = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+                    self.am_res_path,
+                    self.task_resource.res_dict['phones_dict'])
                 self.am_stat = os.path.join(
-                    am_res_path, self.pretrained_models[am_tag]['speech_stats'])
+                    self.am_res_path,
+                    self.task_resource.res_dict['speech_stats'])
 
             else:
                 self.am_encoder_infer = os.path.abspath(am_ckpt[0])
@@ -131,11 +136,15 @@ class TTSServerExecutor(TTSExecutor):
 
         # voc model info
         voc_tag = voc + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=voc_tag,
+            model_type=1,  # vocoder
+            version=None,  # default version
+        )
         if voc_ckpt is None:
-            voc_res_path = self._get_pretrained_path(voc_tag)
-            self.voc_res_path = voc_res_path
+            self.voc_res_path = self.task_resource.voc_res_dir
             self.voc_ckpt = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
+                self.voc_res_path, self.task_resource.voc_res_dict['ckpt'])
         else:
             self.voc_ckpt = os.path.abspath(voc_ckpt)
             self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt))
diff --git a/paddlespeech/server/engine/tts/online/python/pretrained_models.py b/paddlespeech/server/engine/tts/online/python/pretrained_models.py
deleted file mode 100644
index bf6aded51..000000000
--- a/paddlespeech/server/engine/tts/online/python/pretrained_models.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# support online model
-pretrained_models = {
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
-        'md5':
-        '637d28a5e53aa60275612ba4393d5f22',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_76000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_cnndecoder_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
-        'md5':
-        '6eb28e22ace73e0ebe7845f86478f89f',
-        'config':
-        'cnndecoder.yaml',
-        'ckpt':
-        'snapshot_iter_153000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'ee5f0604e20091f0d495b6ec4618b90d',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-
-    # hifigan
-    "hifigan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'dd40a3d88dfcf64513fba2f0f961ada6',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-}
diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py
index 8dc36f8ef..2c08521de 100644
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@@ -22,9 +22,9 @@ import paddle
 import yaml
 from yacs.config import CfgNode
 
-from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
 from paddlespeech.server.utils.util import denorm
@@ -32,7 +32,6 @@ from paddlespeech.server.utils.util import get_chunks
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
-from paddlespeech.utils.dynamic_import import dynamic_import
 
 __all__ = ['TTSEngine']
 
@@ -44,7 +43,8 @@ class TTSServerExecutor(TTSExecutor):
         self.am_pad = am_pad
         self.voc_block = voc_block
         self.voc_pad = voc_pad
-        self.pretrained_models = pretrained_models
+        self.task_resource = CommonTaskResource(
+            task='tts', model_format='static', inference_mode='online')
 
     def get_model_info(self,
                        field: str,
@@ -65,7 +65,7 @@ class TTSServerExecutor(TTSExecutor):
             [Tensor]: standard deviation
         """
 
-        model_class = dynamic_import(model_name, self.model_alias)
+        model_class = self.task_resource.get_model_class(model_name)
 
         if field == "am":
             odim = self.am_config.n_mels
@@ -110,20 +110,24 @@ class TTSServerExecutor(TTSExecutor):
             return
         # am model info
         am_tag = am + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=am_tag,
+            model_type=0,  # am
+            version=None,  # default version
+        )
         if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
-            am_res_path = self._get_pretrained_path(am_tag)
-            self.am_res_path = am_res_path
-            self.am_config = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['config'])
-            self.am_ckpt = os.path.join(am_res_path,
-                                        self.pretrained_models[am_tag]['ckpt'])
+            self.am_res_path = self.task_resource.res_dir
+            self.am_config = os.path.join(self.am_res_path,
+                                          self.task_resource.res_dict['config'])
+            self.am_ckpt = os.path.join(self.am_res_path,
+                                        self.task_resource.res_dict['ckpt'])
             self.am_stat = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
+                self.am_res_path, self.task_resource.res_dict['speech_stats'])
             # must have phones_dict in acoustic
             self.phones_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+                self.am_res_path, self.task_resource.res_dict['phones_dict'])
             print("self.phones_dict:", self.phones_dict)
-            logger.info(am_res_path)
+            logger.info(self.am_res_path)
             logger.info(self.am_config)
             logger.info(self.am_ckpt)
         else:
@@ -139,16 +143,21 @@ class TTSServerExecutor(TTSExecutor):
 
         # voc model info
         voc_tag = voc + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=voc_tag,
+            model_type=1,  # vocoder
+            version=None,  # default version
+        )
         if voc_ckpt is None or voc_config is None or voc_stat is None:
-            voc_res_path = self._get_pretrained_path(voc_tag)
-            self.voc_res_path = voc_res_path
+            self.voc_res_path = self.task_resource.voc_res_dir
             self.voc_config = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['config'])
+                self.voc_res_path, self.task_resource.voc_res_dict['config'])
             self.voc_ckpt = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
+                self.voc_res_path, self.task_resource.voc_res_dict['ckpt'])
             self.voc_stat = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
-            logger.info(voc_res_path)
+                self.voc_res_path,
+                self.task_resource.voc_res_dict['speech_stats'])
+            logger.info(self.voc_res_path)
             logger.info(self.voc_config)
             logger.info(self.voc_ckpt)
         else:
@@ -188,8 +197,8 @@ class TTSServerExecutor(TTSExecutor):
             am, am_mu, am_std = self.get_model_info("am", self.am_name,
                                                     self.am_ckpt, self.am_stat)
             am_normalizer = ZScore(am_mu, am_std)
-            am_inference_class = dynamic_import(self.am_name + '_inference',
-                                                self.model_alias)
+            am_inference_class = self.task_resource.get_model_class(
+                self.am_name + '_inference')
             self.am_inference = am_inference_class(am_normalizer, am)
             self.am_inference.eval()
         print("acoustic model done!")
@@ -199,8 +208,8 @@ class TTSServerExecutor(TTSExecutor):
         voc, voc_mu, voc_std = self.get_model_info("voc", self.voc_name,
                                                    self.voc_ckpt, self.voc_stat)
         voc_normalizer = ZScore(voc_mu, voc_std)
-        voc_inference_class = dynamic_import(self.voc_name + '_inference',
-                                             self.model_alias)
+        voc_inference_class = self.task_resource.get_model_class(self.voc_name +
+                                                                 '_inference')
         self.voc_inference = voc_inference_class(voc_normalizer, voc)
         self.voc_inference.eval()
         print("voc done!")
@@ -505,4 +514,4 @@ class TTSEngine(BaseEngine):
         logger.info(f"RTF: {self.executor.final_response_time / duration}")
         logger.info(
             f"Other info: front time: {self.executor.frontend_time} s, first am infer time: {self.executor.first_am_infer} s, first voc infer time: {self.executor.first_voc_infer} s,"
-        )
\ No newline at end of file
+        )
diff --git a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py
deleted file mode 100644
index 9618a7a69..000000000
--- a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Static model applied on paddle inference
-pretrained_models = {
-    # speedyspeech
-    "speedyspeech_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
-        'md5':
-        'f10cbdedf47dc7a9668d2264494e1823',
-        'model':
-        'speedyspeech_csmsc.pdmodel',
-        'params':
-        'speedyspeech_csmsc.pdiparams',
-        'phones_dict':
-        'phone_id_map.txt',
-        'tones_dict':
-        'tone_id_map.txt',
-        'sample_rate':
-        24000,
-    },
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
-        'md5':
-        '9788cd9745e14c7a5d12d32670b2a5a7',
-        'model':
-        'fastspeech2_csmsc.pdmodel',
-        'params':
-        'fastspeech2_csmsc.pdiparams',
-        'phones_dict':
-        'phone_id_map.txt',
-        'sample_rate':
-        24000,
-    },
-    # pwgan
-    "pwgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
-        'md5':
-        'e3504aed9c5a290be12d1347836d2742',
-        'model':
-        'pwgan_csmsc.pdmodel',
-        'params':
-        'pwgan_csmsc.pdiparams',
-        'sample_rate':
-        24000,
-    },
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
-        'md5':
-        'ac6eee94ba483421d750433f4c3b8d36',
-        'model':
-        'mb_melgan_csmsc.pdmodel',
-        'params':
-        'mb_melgan_csmsc.pdiparams',
-        'sample_rate':
-        24000,
-    },
-    # hifigan
-    "hifigan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
-        'md5':
-        '7edd8c436b3a5546b3a7cb8cff9d5a0c',
-        'model':
-        'hifigan_csmsc.pdmodel',
-        'params':
-        'hifigan_csmsc.pdiparams',
-        'sample_rate':
-        24000,
-    },
-}
diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
index f1ce8b76e..44e564983 100644
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@@ -23,9 +23,9 @@ import paddle
 import soundfile as sf
 from scipy.io import wavfile
 
-from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
+from paddlespeech.resource import CommonTaskResource
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import change_speed
 from paddlespeech.server.utils.errors import ErrorCode
@@ -41,7 +41,8 @@ __all__ = ['TTSEngine']
 class TTSServerExecutor(TTSExecutor):
     def __init__(self):
         super().__init__()
-        self.pretrained_models = pretrained_models
+        self.task_resource = CommonTaskResource(
+            task='tts', model_format='static')
 
     def _init_from_path(
             self,
@@ -67,19 +68,23 @@ class TTSServerExecutor(TTSExecutor):
             return
         # am
         am_tag = am + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=am_tag,
+            model_type=0,  # am
+            version=None,  # default version
+        )
         if am_model is None or am_params is None or phones_dict is None:
-            am_res_path = self._get_pretrained_path(am_tag)
-            self.am_res_path = am_res_path
-            self.am_model = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['model'])
-            self.am_params = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['params'])
+            self.am_res_path = self.task_resource.res_dir
+            self.am_model = os.path.join(self.am_res_path,
+                                         self.task_resource.res_dict['model'])
+            self.am_params = os.path.join(self.am_res_path,
+                                          self.task_resource.res_dict['params'])
             # must have phones_dict in acoustic
             self.phones_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
-            self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate']
+                self.am_res_path, self.task_resource.res_dict['phones_dict'])
+            self.am_sample_rate = self.task_resource.res_dict['sample_rate']
 
-            logger.info(am_res_path)
+            logger.info(self.am_res_path)
             logger.info(self.am_model)
             logger.info(self.am_params)
         else:
@@ -92,32 +97,36 @@ class TTSServerExecutor(TTSExecutor):
 
         # for speedyspeech
         self.tones_dict = None
-        if 'tones_dict' in self.pretrained_models[am_tag]:
+        if 'tones_dict' in self.task_resource.res_dict:
             self.tones_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['tones_dict'])
+                self.am_res_path, self.task_resource.res_dict['tones_dict'])
             if tones_dict:
                 self.tones_dict = tones_dict
 
         # for multi speaker fastspeech2
         self.speaker_dict = None
-        if 'speaker_dict' in self.pretrained_models[am_tag]:
+        if 'speaker_dict' in self.task_resource.res_dict:
             self.speaker_dict = os.path.join(
-                am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
+                self.am_res_path, self.task_resource.res_dict['speaker_dict'])
             if speaker_dict:
                 self.speaker_dict = speaker_dict
 
         # voc
         voc_tag = voc + '-' + lang
+        self.task_resource.set_task_model(
+            model_tag=voc_tag,
+            model_type=1,  # vocoder
+            version=None,  # default version
+        )
         if voc_model is None or voc_params is None:
-            voc_res_path = self._get_pretrained_path(voc_tag)
-            self.voc_res_path = voc_res_path
+            self.voc_res_path = self.task_resource.voc_res_dir
             self.voc_model = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['model'])
+                self.voc_res_path, self.task_resource.voc_res_dict['model'])
             self.voc_params = os.path.join(
-                voc_res_path, self.pretrained_models[voc_tag]['params'])
-            self.voc_sample_rate = self.pretrained_models[voc_tag][
+                self.voc_res_path, self.task_resource.voc_res_dict['params'])
+            self.voc_sample_rate = self.task_resource.voc_res_dict[
                 'sample_rate']
-            logger.info(voc_res_path)
+            logger.info(self.voc_res_path)
             logger.info(self.voc_model)
             logger.info(self.voc_params)
         else:

From 7766d7344d16dc1d200151a2f7d23811c08db0a9 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 26 May 2022 14:39:01 +0800
Subject: [PATCH 30/40] Add paddlespeech.resource.

---
 paddlespeech/resource/model_alias.py | 87 ++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 paddlespeech/resource/model_alias.py

diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py
new file mode 100644
index 000000000..2b19ed065
--- /dev/null
+++ b/paddlespeech/resource/model_alias.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'model_alias',
+]
+
+# Maps each model name to the "module:ClassName" import path(s) to load for it.
+model_alias = {
+    # ---------------------------------
+    # -------------- ASR --------------
+    # ---------------------------------
+    "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
+    "deepspeech2online":
+    ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+    "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
+    "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
+    "transformer": ["paddlespeech.s2t.models.u2:U2Model"],
+    "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"],
+
+    # ---------------------------------
+    # -------------- CLS --------------
+    # ---------------------------------
+    "panns_cnn6": ["paddlespeech.cls.models.panns:CNN6"],
+    "panns_cnn10": ["paddlespeech.cls.models.panns:CNN10"],
+    "panns_cnn14": ["paddlespeech.cls.models.panns:CNN14"],
+
+    # ---------------------------------
+    # -------------- ST ---------------
+    # ---------------------------------
+    "fat_st": ["paddlespeech.s2t.models.u2_st:U2STModel"],
+
+    # ---------------------------------
+    # -------------- TEXT -------------
+    # ---------------------------------
+    "ernie_linear_p7": [
+        "paddlespeech.text.models:ErnieLinear",
+        "paddlenlp.transformers:ErnieTokenizer"
+    ],
+    "ernie_linear_p3": [
+        "paddlespeech.text.models:ErnieLinear",
+        "paddlenlp.transformers:ErnieTokenizer"
+    ],
+
+    # ---------------------------------
+    # -------------- TTS --------------
+    # ---------------------------------
+    # acoustic model
+    "speedyspeech": ["paddlespeech.t2s.models.speedyspeech:SpeedySpeech"],
+    "speedyspeech_inference":
+    ["paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference"],
+    "fastspeech2": ["paddlespeech.t2s.models.fastspeech2:FastSpeech2"],
+    "fastspeech2_inference":
+    ["paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference"],
+    "tacotron2": ["paddlespeech.t2s.models.tacotron2:Tacotron2"],
+    "tacotron2_inference":
+    ["paddlespeech.t2s.models.tacotron2:Tacotron2Inference"],
+    # voc
+    "pwgan": ["paddlespeech.t2s.models.parallel_wavegan:PWGGenerator"],
+    "pwgan_inference":
+    ["paddlespeech.t2s.models.parallel_wavegan:PWGInference"],
+    "mb_melgan": ["paddlespeech.t2s.models.melgan:MelGANGenerator"],
+    "mb_melgan_inference": ["paddlespeech.t2s.models.melgan:MelGANInference"],
+    "style_melgan": ["paddlespeech.t2s.models.melgan:StyleMelGANGenerator"],
+    "style_melgan_inference":
+    ["paddlespeech.t2s.models.melgan:StyleMelGANInference"],
+    "hifigan": ["paddlespeech.t2s.models.hifigan:HiFiGANGenerator"],
+    "hifigan_inference": ["paddlespeech.t2s.models.hifigan:HiFiGANInference"],
+    "wavernn": ["paddlespeech.t2s.models.wavernn:WaveRNN"],
+    "wavernn_inference": ["paddlespeech.t2s.models.wavernn:WaveRNNInference"],
+
+    # ---------------------------------
+    # ------------ Vector -------------
+    # ---------------------------------
+    "ecapatdnn": ["paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn"],
+}

From dcea088c660b84d6266af436d8aff604382a1f56 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Mon, 30 May 2022 14:57:20 +0800
Subject: [PATCH 31/40] Add paddlespeech.resource.

---
 paddlespeech/cli/asr/infer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 842acf5ce..92f9b0e41 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -167,8 +167,8 @@ class ASRExecutor(BaseExecutor):
                 self.collate_fn_test = SpeechCollator.from_config(self.config)
                 self.text_feature = TextFeaturizer(
                     unit_type=self.config.unit_type, vocab=self.vocab)
-                lm_url = self.resource.res_dict['lm_url']
-                lm_md5 = self.resource.res_dict['lm_md5']
+                lm_url = self.task_resource.res_dict['lm_url']
+                lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(
                     lm_url,
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)

From 1e066fab9ecba98613d8f8076d9d0726834e386a Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Mon, 30 May 2022 16:22:32 +0800
Subject: [PATCH 32/40] Add paddlespeech.resource.

---
 paddlespeech/cli/vector/infer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 8bf090013..56f86f9b8 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -297,8 +297,7 @@ class VectorExecutor(BaseExecutor):
             logger.info(f"load the pretrained model: {tag}")
             # get the model from the pretrained list
             # we download the pretrained model and store it in the res_path
-            res_path = self._get_pretrained_path(tag)
-            self.res_path = res_path
+            self.res_path = self.task_resource.res_dir
 
             self.cfg_path = os.path.join(
                 self.task_resource.res_dir,

From 6a082215251f018a4050a1dae97f1a543668551f Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Mon, 30 May 2022 19:37:08 +0800
Subject: [PATCH 33/40] Add paddlespeech.resource.

---
 paddlespeech/resource/pretrained_models.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 9441a2805..84362f967 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -57,6 +57,14 @@ asr_dynamic_pretrained_models = {
             'model.yaml',
             'ckpt_path':
             'exp/chunk_conformer/checkpoints/avg_10',
+            'model':
+            'exp/chunk_conformer/checkpoints/avg_10.pdparams',
+            'params':
+            'exp/chunk_conformer/checkpoints/avg_10.pdparams',
+            'lm_url':
+            '',
+            'lm_md5':
+            '',
         },
     },
     "conformer_online_multicn-zh-16k": {
@@ -79,10 +87,14 @@ asr_dynamic_pretrained_models = {
             'model.yaml',
             'ckpt_path':
             'exp/chunk_conformer/checkpoints/multi_cn',
+            'model':
+            'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
+            'params':
+            'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
-            '29e02312deb2e59b3c8686c7966d4fe3'
+            '29e02312deb2e59b3c8686c7966d4fe3',
         },
     },
     "conformer_aishell-zh-16k": {
@@ -163,6 +175,10 @@ asr_dynamic_pretrained_models = {
             'model.yaml',
             'ckpt_path':
             'exp/deepspeech2_online/checkpoints/avg_1',
+            'model':
+            'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
+            'params':
+            'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':

From 9a253bc0918391b9fb1daedc615095fe00d1af37 Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Mon, 30 May 2022 11:58:37 +0000
Subject: [PATCH 34/40] gen lexicon with tone in mfa, test=tts

---
 examples/other/mfa/local/reorganize_baker.py | 3 ---
 examples/other/mfa/run.sh                    | 6 ++----
 2 files changed, 2 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 examples/other/mfa/run.sh

diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py
index 8adad834f..153e01d13 100644
--- a/examples/other/mfa/local/reorganize_baker.py
+++ b/examples/other/mfa/local/reorganize_baker.py
@@ -42,9 +42,6 @@ def get_transcripts(path: Union[str, Path]):
     for i in range(0, len(lines), 2):
         sentence_id = lines[i].split()[0]
         transcription = lines[i + 1].strip()
-        # tones are dropped here
-        # since the lexicon does not consider tones, too
-        transcription = " ".join([item[:-1] for item in transcription.split()])
         transcripts[sentence_id] = transcription
 
     return transcripts
diff --git a/examples/other/mfa/run.sh b/examples/other/mfa/run.sh
old mode 100644
new mode 100755
index 1fef58b4e..29dacc9b1
--- a/examples/other/mfa/run.sh
+++ b/examples/other/mfa/run.sh
@@ -4,7 +4,7 @@ mkdir -p $EXP_DIR
 LEXICON_NAME='simple'
 if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
     echo "generating lexicon..."
-    python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r
+    python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r --with-tone
     echo "lexicon done"
 fi
 
@@ -16,6 +16,7 @@ if [ ! -d $EXP_DIR/baker_corpus ]; then
     echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus "
 fi
 
+
 echo "detecting oov..."
 python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon"
 echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs."
@@ -44,6 +45,3 @@ if [ ! -d "$EXP_DIR/baker_alignment" ]; then
     echo "model: $EXP_DIR/baker_model"
 fi
 
-
-
-

From 00a185b147791b4803ac3f3c0034a729f6a00318 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Mon, 30 May 2022 21:44:12 +0800
Subject: [PATCH 35/40] add dispenser in frontend

---
 speechx/speechx/decoder/param.h               |  4 +-
 speechx/speechx/frontend/audio/CMakeLists.txt |  3 +-
 .../frontend/audio/compute_fbank_main.cc      |  4 --
 .../audio/compute_linear_spectrogram_main.cc  |  4 --
 speechx/speechx/frontend/audio/dispenser.cc   | 72 +++++++++++++++++++
 speechx/speechx/frontend/audio/dispenser.h    | 67 +++++++++++++++++
 .../speechx/frontend/audio/feature_cache.cc   | 30 ++------
 .../speechx/frontend/audio/feature_cache.h    |  6 +-
 .../frontend/audio/feature_pipeline.cc        |  5 +-
 .../speechx/frontend/audio/feature_pipeline.h |  6 +-
 10 files changed, 157 insertions(+), 44 deletions(-)
 create mode 100644 speechx/speechx/frontend/audio/dispenser.cc
 create mode 100644 speechx/speechx/frontend/audio/dispenser.h

diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index b2bf1890a..ae7595682 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -81,8 +81,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
         frame_opts.preemph_coeff = 0.0;
         opts.linear_spectrogram_opts.frame_opts = frame_opts;
     }
-    opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
-    opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
+    opts.dispenser_opts.frame_chunk_size = FLAGS_receptive_field_length;
+    opts.dispenser_opts.frame_chunk_stride = FLAGS_downsampling_rate;
     return opts;
 }
 
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index 0aec68faf..ee7c05c4c 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -8,6 +8,7 @@ add_library(frontend STATIC
   feature_cache.cc
   feature_pipeline.cc
   fbank.cc
+  dispenser.cc
 )
 target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
 
@@ -27,4 +28,4 @@ foreach(bin_name IN LISTS BINS)
   add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
   target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
   target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog)
-endforeach()
\ No newline at end of file
+endforeach()
diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
index 67683eebf..18024719b 100644
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -64,10 +64,6 @@ int main(int argc, char* argv[]) {
 
     ppspeech::FeatureCacheOptions feat_cache_opts;
     // the feature cache output feature chunk by chunk.
-    // frame_chunk_size : num frame of a chunk.
-    // frame_chunk_stride: chunk sliding window stride.
-    feat_cache_opts.frame_chunk_stride = 1;
-    feat_cache_opts.frame_chunk_size = 1;
     ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
     LOG(INFO) << "fbank: " << true;
     LOG(INFO) << "feat dim: " << feature_cache.Dim();
diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
index 943b74b89..cc7a5e17c 100644
--- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
+++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc
@@ -66,10 +66,6 @@ int main(int argc, char* argv[]) {
 
     ppspeech::FeatureCacheOptions feat_cache_opts;
     // the feature cache output feature chunk by chunk.
-    // frame_chunk_size : num frame of a chunk.
-    // frame_chunk_stride: chunk sliding window stride.
-    feat_cache_opts.frame_chunk_stride = 1;
-    feat_cache_opts.frame_chunk_size = 1;
     ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
     LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
diff --git a/speechx/speechx/frontend/audio/dispenser.cc b/speechx/speechx/frontend/audio/dispenser.cc
new file mode 100644
index 000000000..0e8cdc6f6
--- /dev/null
+++ b/speechx/speechx/frontend/audio/dispenser.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/audio/dispenser.h"
+
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::VectorBase;
+using kaldi::BaseFloat;
+using std::unique_ptr;
+
+Dispenser::Dispenser(DispenserOptions opts,
+                     unique_ptr<FrontendInterface> base_extractor) {
+    frame_chunk_stride_ = opts.frame_chunk_stride;
+    frame_chunk_size_ = opts.frame_chunk_size;
+    base_extractor_ = std::move(base_extractor);
+    dim_ = base_extractor_->Dim();
+}
+
+void Dispenser::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+    // read inputs
+    base_extractor_->Accept(inputs);
+}
+
+// pop feature chunk
+bool Dispenser::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+    feats->Resize(dim_ * frame_chunk_size_);
+    bool result = Compute(feats);
+    return result;
+}
+
+// read all data from base_feature_extractor_ into cache_
+bool Dispenser::Compute(Vector<BaseFloat>* feats) {
+    // compute and feed
+    bool result = false;
+    while (feature_cache_.size() < frame_chunk_size_) {
+        Vector<BaseFloat> feature;  // one element read from the upstream extractor
+        result = base_extractor_->Read(&feature);
+        if (result == false || feature.Dim() == 0) return false;  // not enough data yet
+        feature_cache_.push(feature);  // frames pushed so far stay queued for the next call
+    }
+    // Copy frame_chunk_size_ queued frames into feats, re-queuing the overlap frames.
+    int32 counter = 0; 
+    int32 cache_size = frame_chunk_size_ - frame_chunk_stride_;  // frames kept for the next chunk
+    int32 elem_dim = base_extractor_->Dim();
+    while (counter < frame_chunk_size_) {
+      Vector<BaseFloat>& val = feature_cache_.front();
+      int32 start = counter * elem_dim;  // offset of this frame inside feats
+      feats->Range(start, elem_dim).CopyFromVec(val);
+      if (frame_chunk_size_ - counter <= cache_size ) {  // one of the last cache_size frames
+          feature_cache_.push(val);  // copy to the back so it is read again by the next call
+      }
+      feature_cache_.pop();
+      counter++;
+    }
+
+    return result;
+}
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/dispenser.h b/speechx/speechx/frontend/audio/dispenser.h
new file mode 100644
index 000000000..89d9c977b
--- /dev/null
+++ b/speechx/speechx/frontend/audio/dispenser.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/audio/frontend_itf.h"
+
+namespace ppspeech {
+
+struct DispenserOptions {
+    int32 frame_chunk_size;
+    int32 frame_chunk_stride;
+    
+    DispenserOptions()
+        : frame_chunk_size(1),
+          frame_chunk_stride(1) {}
+};
+
+class Dispenser : public FrontendInterface {
+  public:
+    explicit Dispenser(
+        DispenserOptions opts,
+        std::unique_ptr<FrontendInterface> base_extractor = NULL);
+
+    // Feed feats or waves
+    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+
+    // feats size = num_frames * feat_dim
+    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    // feat dim
+    virtual size_t Dim() const { return dim_; }
+
+    virtual void SetFinished() {
+        base_extractor_->SetFinished();
+    }
+
+    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
+    virtual void Reset() {  // also drop overlap frames buffered from the previous stream
+        std::queue<kaldi::Vector<kaldi::BaseFloat>>().swap(feature_cache_);
+        base_extractor_->Reset();
+    }
+  private:
+    bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);
+
+    int32 dim_;
+    int32 frame_chunk_size_;    // window
+    int32 frame_chunk_stride_;  // stride
+    std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
+    std::unique_ptr<FrontendInterface> base_extractor_;
+    DISALLOW_COPY_AND_ASSIGN(Dispenser);
+};
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc
index 05283bb7e..930f29c54 100644
--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@@ -26,8 +26,6 @@ using std::unique_ptr;
 FeatureCache::FeatureCache(FeatureCacheOptions opts,
                            unique_ptr<FrontendInterface> base_extractor) {
     max_size_ = opts.max_size;
-    frame_chunk_stride_ = opts.frame_chunk_stride;
-    frame_chunk_size_ = opts.frame_chunk_size;
     timeout_ = opts.timeout;  // ms
     base_extractor_ = std::move(base_extractor);
     dim_ = base_extractor_->Dim();
@@ -74,24 +72,11 @@ bool FeatureCache::Compute() {
     bool result = base_extractor_->Read(&feature);
     if (result == false || feature.Dim() == 0) return false;
 
-    // join with remained
-    int32 joint_len = feature.Dim() + remained_feature_.Dim();
-    Vector<BaseFloat> joint_feature(joint_len);
-    joint_feature.Range(0, remained_feature_.Dim())
-        .CopyFromVec(remained_feature_);
-    joint_feature.Range(remained_feature_.Dim(), feature.Dim())
-        .CopyFromVec(feature);
-
-    // one by one, or stride with window
-    // controlled by frame_chunk_stride_ and frame_chunk_size_
-    int32 num_chunk =
-        ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
+    int32 num_chunk = feature.Dim() / dim_ ;
     for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
-        int32 start = chunk_idx * frame_chunk_stride_ * dim_;
-
-        Vector<BaseFloat> feature_chunk(frame_chunk_size_ * dim_);
-        SubVector<BaseFloat> tmp(joint_feature.Data() + start,
-                                 frame_chunk_size_ * dim_);
+        int32 start = chunk_idx *  dim_;
+        Vector<BaseFloat> feature_chunk(dim_);
+        SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
         feature_chunk.CopyFromVec(tmp);
 
         std::unique_lock<std::mutex> lock(mutex_);
@@ -104,13 +89,6 @@ bool FeatureCache::Compute() {
         cache_.push(feature_chunk);
         ready_read_condition_.notify_one();
     }
-
-    // cache remained feats
-    int32 remained_feature_len =
-        joint_len - num_chunk * frame_chunk_stride_ * dim_;
-    remained_feature_.Resize(remained_feature_len);
-    remained_feature_.CopyFromVec(joint_feature.Range(
-        frame_chunk_stride_ * num_chunk * dim_, remained_feature_len));
     return result;
 }
 
diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h
index 0dc704bbf..4c016056a 100644
--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@@ -21,13 +21,9 @@ namespace ppspeech {
 
 struct FeatureCacheOptions {
     int32 max_size;
-    int32 frame_chunk_size;
-    int32 frame_chunk_stride;
     int32 timeout;  // ms
     FeatureCacheOptions()
         : max_size(kint16max),
-          frame_chunk_size(1),
-          frame_chunk_stride(1),
           timeout(1) {}
 };
 
@@ -80,7 +76,7 @@ class FeatureCache : public FrontendInterface {
     std::condition_variable ready_feed_condition_;
     std::condition_variable ready_read_condition_;
 
-    // DISALLOW_COPY_AND_ASSGIN(FeatureCache);
+    DISALLOW_COPY_AND_ASSIGN(FeatureCache);
 };
 
 }  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc
index 087de0f0d..026905f06 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -35,8 +35,11 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
     unique_ptr<FrontendInterface> cmvn(
         new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));
 
-    base_extractor_.reset(
+    unique_ptr<FrontendInterface> cache(
         new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
+
+    base_extractor_.reset(
+        new ppspeech::Dispenser(opts.dispenser_opts, std::move(cache)));
 }
 
 }  // ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h
index 6b9b4795e..9f86c634c 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -23,6 +23,7 @@
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/normalizer.h"
+#include "frontend/audio/dispenser.h"
 
 namespace ppspeech {
 
@@ -33,13 +34,16 @@ struct FeaturePipelineOptions {
     LinearSpectrogramOptions linear_spectrogram_opts;
     FbankOptions fbank_opts;
     FeatureCacheOptions feature_cache_opts;
+    DispenserOptions dispenser_opts;
+
     FeaturePipelineOptions()
         : cmvn_file(""),
           to_float32(false),  // true, only for linear feature
           use_fbank(true),
           linear_spectrogram_opts(),
           fbank_opts(),
-          feature_cache_opts() {}
+          feature_cache_opts(),
+          dispenser_opts() {}
 };
 
 class FeaturePipeline : public FrontendInterface {

From 10819e0fa23a9e8a4c63d9e2ec82f9591a539c2f Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Mon, 30 May 2022 13:57:52 +0000
Subject: [PATCH 36/40] not install ctc on win, test=asr

---
 paddlespeech/s2t/models/ds2/__init__.py        | 4 +++-
 paddlespeech/s2t/models/ds2_online/__init__.py | 4 +++-
 paddlespeech/s2t/modules/ctc.py                | 4 +++-
 paddlespeech/s2t/transform/perturb.py          | 3 ++-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/paddlespeech/s2t/models/ds2/__init__.py b/paddlespeech/s2t/models/ds2/__init__.py
index b32220673..0a5c50d86 100644
--- a/paddlespeech/s2t/models/ds2/__init__.py
+++ b/paddlespeech/s2t/models/ds2/__init__.py
@@ -14,13 +14,15 @@
 from .deepspeech2 import DeepSpeech2InferModel
 from .deepspeech2 import DeepSpeech2Model
 from paddlespeech.s2t.utils import dynamic_pip_install
+import sys
 
 try:
     import paddlespeech_ctcdecoders
 except ImportError:
     try:
         package_name = 'paddlespeech_ctcdecoders'
-        dynamic_pip_install.install(package_name)
+        if sys.platform != "win32":
+            dynamic_pip_install.install(package_name)
     except Exception:
         raise RuntimeError(
             "Can not install package paddlespeech_ctcdecoders on your system. \
diff --git a/paddlespeech/s2t/models/ds2_online/__init__.py b/paddlespeech/s2t/models/ds2_online/__init__.py
index c5fdab1bc..de772b645 100644
--- a/paddlespeech/s2t/models/ds2_online/__init__.py
+++ b/paddlespeech/s2t/models/ds2_online/__init__.py
@@ -14,13 +14,15 @@
 from .deepspeech2 import DeepSpeech2InferModelOnline
 from .deepspeech2 import DeepSpeech2ModelOnline
 from paddlespeech.s2t.utils import dynamic_pip_install
+import sys
 
 try:
     import paddlespeech_ctcdecoders
 except ImportError:
     try:
         package_name = 'paddlespeech_ctcdecoders'
-        dynamic_pip_install.install(package_name)
+        if sys.platform != "win32":
+            dynamic_pip_install.install(package_name)
     except Exception:
         raise RuntimeError(
             "Can not install package paddlespeech_ctcdecoders on your system. \
diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py
index 33ad472de..ca576eef1 100644
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@@ -22,6 +22,7 @@ from paddlespeech.s2t.modules.align import Linear
 from paddlespeech.s2t.modules.loss import CTCLoss
 from paddlespeech.s2t.utils import ctc_utils
 from paddlespeech.s2t.utils.log import Log
+import sys
 
 logger = Log(__name__).getlog()
 
@@ -34,7 +35,8 @@ except ImportError:
     try:
         from paddlespeech.s2t.utils import dynamic_pip_install
         package_name = 'paddlespeech_ctcdecoders'
-        dynamic_pip_install.install(package_name)
+        if sys.platform != "win32":
+            dynamic_pip_install.install(package_name)
         from paddlespeech.s2t.decoders.ctcdecoder import ctc_beam_search_decoding_batch  # noqa: F401
         from paddlespeech.s2t.decoders.ctcdecoder import ctc_greedy_decoding  # noqa: F401
         from paddlespeech.s2t.decoders.ctcdecoder import Scorer  # noqa: F401
diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py
index 9e41b824b..b18caefb8 100644
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -154,7 +154,8 @@ class SpeedPerturbationSox():
                 package = "sox"
                 dynamic_pip_install.install(package)
                 package = "soxbindings"
-                dynamic_pip_install.install(package)
+                if sys.platform != "win32":
+                    dynamic_pip_install.install(package)
                 import soxbindings as sox
             except Exception:
                 raise RuntimeError(

From 952b1a145110da49d667562bde8c9f27e8550ac9 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Tue, 31 May 2022 08:41:22 +0800
Subject: [PATCH 37/40] rename, test=doc

---
 speechx/speechx/decoder/param.h                      |  6 +++---
 speechx/speechx/frontend/audio/CMakeLists.txt        |  2 +-
 .../frontend/audio/{dispenser.cc => assembler.cc}    | 12 ++++++------
 .../frontend/audio/{dispenser.h => assembler.h}      | 12 ++++++------
 speechx/speechx/frontend/audio/feature_pipeline.cc   |  4 ++--
 speechx/speechx/frontend/audio/feature_pipeline.h    |  8 ++++----
 6 files changed, 22 insertions(+), 22 deletions(-)
 rename speechx/speechx/frontend/audio/{dispenser.cc => assembler.cc} (87%)
 rename speechx/speechx/frontend/audio/{dispenser.h => assembler.h} (90%)

diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index ae7595682..495e5236c 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -81,8 +81,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
         frame_opts.preemph_coeff = 0.0;
         opts.linear_spectrogram_opts.frame_opts = frame_opts;
     }
-    opts.dispenser_opts.frame_chunk_size = FLAGS_receptive_field_length;
-    opts.dispenser_opts.frame_chunk_stride = FLAGS_downsampling_rate;
+    opts.assembler_opts.frame_chunk_size = FLAGS_receptive_field_length;
+    opts.assembler_opts.frame_chunk_stride = FLAGS_downsampling_rate;
     return opts;
 }
 
@@ -115,4 +115,4 @@ RecognizerResource InitRecognizerResoure() {
     resource.tlg_opts = InitDecoderOptions();
     return resource;
 }
-}
\ No newline at end of file
+}
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index ee7c05c4c..8ae63256a 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -8,7 +8,7 @@ add_library(frontend STATIC
   feature_cache.cc
   feature_pipeline.cc
   fbank.cc
-  dispenser.cc
+  assembler.cc
 )
 target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
 
diff --git a/speechx/speechx/frontend/audio/dispenser.cc b/speechx/speechx/frontend/audio/assembler.cc
similarity index 87%
rename from speechx/speechx/frontend/audio/dispenser.cc
rename to speechx/speechx/frontend/audio/assembler.cc
index 0e8cdc6f6..47e0705b9 100644
--- a/speechx/speechx/frontend/audio/dispenser.cc
+++ b/speechx/speechx/frontend/audio/assembler.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "frontend/audio/dispenser.h"
+#include "frontend/audio/assembler.h"
 
 namespace ppspeech {
 
@@ -21,7 +21,7 @@ using kaldi::VectorBase;
 using kaldi::BaseFloat;
 using std::unique_ptr;
 
-Dispenser::Dispenser(DispenserOptions opts,
+Assembler::Assembler(AssemblerOptions opts,
                      unique_ptr<FrontendInterface> base_extractor) {
     frame_chunk_stride_ = opts.frame_chunk_stride;
     frame_chunk_size_ = opts.frame_chunk_size;
@@ -29,20 +29,20 @@ Dispenser::Dispenser(DispenserOptions opts,
     dim_ = base_extractor_->Dim();
 }
 
-void Dispenser::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
+void Assembler::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
     // read inputs
     base_extractor_->Accept(inputs);
 }
 
 // pop feature chunk
-bool Dispenser::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
+bool Assembler::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
     feats->Resize(dim_ * frame_chunk_size_);
     bool result = Compute(feats);
     return result;
 }
 
 // read all data from base_feature_extractor_ into cache_
-bool Dispenser::Compute(Vector<BaseFloat>* feats) {
+bool Assembler::Compute(Vector<BaseFloat>* feats) {
     // compute and feed
     bool result = false;
     while (feature_cache_.size() < frame_chunk_size_) {
@@ -69,4 +69,4 @@ bool Dispenser::Compute(Vector<BaseFloat>* feats) {
     return result;
 }
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/dispenser.h b/speechx/speechx/frontend/audio/assembler.h
similarity index 90%
rename from speechx/speechx/frontend/audio/dispenser.h
rename to speechx/speechx/frontend/audio/assembler.h
index 89d9c977b..4397d3f6d 100644
--- a/speechx/speechx/frontend/audio/dispenser.h
+++ b/speechx/speechx/frontend/audio/assembler.h
@@ -19,19 +19,19 @@
 
 namespace ppspeech {
 
-struct DispenserOptions {
+struct AssemblerOptions {
     int32 frame_chunk_size;
     int32 frame_chunk_stride;
     
-    DispenserOptions()
+    AssemblerOptions()
         : frame_chunk_size(1),
           frame_chunk_stride(1) {}
 };
 
-class Dispenser : public FrontendInterface {
+class Assembler : public FrontendInterface {
   public:
-    explicit Dispenser(
-        DispenserOptions opts,
+    explicit Assembler(
+        AssemblerOptions opts,
         std::unique_ptr<FrontendInterface> base_extractor = NULL);
 
     // Feed feats or waves
@@ -61,7 +61,7 @@ class Dispenser : public FrontendInterface {
     int32 frame_chunk_stride_;  // stride
     std::queue<kaldi::Vector<kaldi::BaseFloat>> feature_cache_;
     std::unique_ptr<FrontendInterface> base_extractor_;
-    DISALLOW_COPY_AND_ASSIGN(Dispenser);
+    DISALLOW_COPY_AND_ASSIGN(Assembler);
 };
 
 }  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc
index 026905f06..9cacff9f7 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -39,7 +39,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
         new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
 
     base_extractor_.reset(
-        new ppspeech::Dispenser(opts.dispenser_opts, std::move(cache)));
+        new ppspeech::Assembler(opts.assembler_opts, std::move(cache)));
 }
 
-}  // ppspeech
\ No newline at end of file
+}  // ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h
index 9f86c634c..b848f548b 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -23,7 +23,7 @@
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/normalizer.h"
-#include "frontend/audio/dispenser.h"
+#include "frontend/audio/assembler.h"
 
 namespace ppspeech {
 
@@ -34,7 +34,7 @@ struct FeaturePipelineOptions {
     LinearSpectrogramOptions linear_spectrogram_opts;
     FbankOptions fbank_opts;
     FeatureCacheOptions feature_cache_opts;
-    DispenserOptions dispenser_opts;
+    AssemblerOptions assembler_opts;
 
     FeaturePipelineOptions()
         : cmvn_file(""),
@@ -43,7 +43,7 @@ struct FeaturePipelineOptions {
           linear_spectrogram_opts(),
           fbank_opts(),
           feature_cache_opts(),
-          dispenser_opts() {}
+          assembler_opts() {}
 };
 
 class FeaturePipeline : public FrontendInterface {
@@ -63,4 +63,4 @@ class FeaturePipeline : public FrontendInterface {
   private:
     std::unique_ptr<FrontendInterface> base_extractor_;
 };
-}
\ No newline at end of file
+}

From 6c7ed42712b3a2d52af48da9bba8453bae0b0370 Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Tue, 31 May 2022 03:10:07 +0000
Subject: [PATCH 38/40] fix ljspeech readme, test=doc

---
 examples/ljspeech/tts3/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index fb1f9f4f7..81a0580c0 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -107,7 +107,7 @@ pwg_ljspeech_ckpt_0.5
 ```bash
 CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
-``text
+```text
 usage: synthesize.py [-h]
                      [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
                      [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]

From 6436f343bb7a40db7c63ca9da4ebe61ce3a1c76c Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Tue, 31 May 2022 11:26:03 +0800
Subject: [PATCH 39/40] Fix asr_inference server engine.

---
 paddlespeech/server/engine/asr/paddleinference/asr_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
index 80e323fa0..b030293f7 100644
--- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
@@ -37,7 +37,7 @@ class ASRServerExecutor(ASRExecutor):
     def __init__(self):
         super().__init__()
         self.task_resource = CommonTaskResource(
-            task='asr', model_format='static', inference_mode='online')
+            task='asr', model_format='static')
 
     def _init_from_path(self,
                         model_type: str='wenetspeech',

From 46690b1b3cc4f2df32fa6930564ff69c8113ddc9 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Tue, 31 May 2022 15:50:46 +0800
Subject: [PATCH 40/40] Fix windows issue in paddlespeech.resource

---
 paddlespeech/resource/resource.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/paddlespeech/resource/resource.py b/paddlespeech/resource/resource.py
index f00b1b3b0..fff5f745d 100644
--- a/paddlespeech/resource/resource.py
+++ b/paddlespeech/resource/resource.py
@@ -82,6 +82,7 @@ class CommonTaskResource:
             self.model_tag = model_tag
             self.version = version
             self.res_dict = self.pretrained_models[model_tag][version]
+            self.format_path(self.res_dict)
             self.res_dir = self._fetch(self.res_dict,
                                        self._get_model_dir(model_type))
         else:
@@ -89,9 +90,19 @@ class CommonTaskResource:
             self.voc_model_tag = model_tag
             self.voc_version = version
             self.voc_res_dict = self.pretrained_models[model_tag][version]
+            self.format_path(self.voc_res_dict)
             self.voc_res_dir = self._fetch(self.voc_res_dict,
                                            self._get_model_dir(model_type))
 
+    @staticmethod
+    def format_path(res_dict: Dict[str, str]):
+        """Convert '/'-separated paths in res_dict to OS-native form in place."""
+        for k, v in res_dict.items():
+            # Non-str values cannot be split; URLs must keep their '/'.
+            if not isinstance(v, str) or v.startswith(('https://', 'http://')):
+                continue
+            res_dict[k] = os.path.join(*v.split('/'))
+
     @staticmethod
     def get_model_class(model_name) -> List[object]:
         """Dynamic import model class.