Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into dev
@@ -0,0 +1,30 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/src/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF
formats: []

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - method: pip
      path: .
      extra_requirements:
        - doc

    - requirements: docs/requirements.txt
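For reference, the build above corresponds roughly to the following local commands. This is only a sketch: the output directory and the exact `sphinx-build` invocation are assumptions, not part of the configuration file.

```bash
# Rough local equivalent of the Read the Docs build above (sketch).
pip install '.[doc]'                             # method: pip, path: ., extra_requirements: doc
pip install -r docs/requirements.txt             # the extra requirements file listed above
sphinx-build -b html docs/src docs/_build/html   # docs/src/conf.py is the Sphinx configuration
```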
@@ -0,0 +1,528 @@
"""Beam search module."""

from itertools import chain
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple
from typing import Union

import paddle

from .utils import end_detect
from .scorers.scorer_interface import PartialScorerInterface
from .scorers.scorer_interface import ScorerInterface

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()


class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: paddle.Tensor  # (T,)
    score: Union[float, paddle.Tensor] = 0
    scores: Dict[str, Union[float, paddle.Tensor]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()


class BeamSearch(paddle.nn.Layer):
    """Beam search implementation."""

    def __init__(
        self,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        beam_size: int,
        vocab_size: int,
        sos: int,
        eos: int,
        token_list: List[str] = None,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
    ):
        """Initialize beam search.

        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM
                The scorer will be ignored if it is `None`
            weights (dict[str, float]): Dict of weights for each scorer
                The scorer will be ignored if its weight is 0
            beam_size (int): The number of hypotheses kept during search
            vocab_size (int): The size of the vocabulary
            sos (int): Start of sequence id
            eos (int): End of sequence id
            token_list (list[str]): List of tokens for debug log
            pre_beam_score_key (str): key of scores to perform pre-beam search
            pre_beam_ratio (float): beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`

        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()  # all = full + partial
        self.full_scorers = dict()  # full tokens
        self.part_scorers = dict()  # partial tokens
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = paddle.nn.LayerDict()  # nn.Layer
        for k, v in scorers.items():
            w = weights.get(k, 0)
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
            else:
                self.full_scorers[k] = v
            if isinstance(v, paddle.nn.Layer):
                self.nn_dict[k] = v

        # set configurations
        self.sos = sos
        self.eos = eos
        self.token_list = token_list
        # pre_beam_size > beam_size
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (
            pre_beam_score_key is not None
            and pre_beam_score_key != "full"
            and pre_beam_score_key not in self.full_scorers
        ):
            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
        # selected `key` scorer to do pre beam search
        self.pre_beam_score_key = pre_beam_score_key
        # do pre-beam only when needed: a valid key is set and there are partial scorers
        self.do_pre_beam = (
            self.pre_beam_score_key is not None
            and self.pre_beam_size < self.n_vocab
            and len(self.part_scorers) > 0
        )

    def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.

        Args:
            x (paddle.Tensor): The encoder output feature, (T, D)

        Returns:
            Hypothesis: The initial hypothesis.

        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                yseq=paddle.to_tensor([self.sos], place=x.place),
                score=0.0,
                scores=init_scores,
                states=init_states,
            )
        ]

    @staticmethod
    def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor:
        """Append new token to prefix tokens.

        Args:
            xs (paddle.Tensor): The prefix token, (T,)
            x (int): The new token to append

        Returns:
            paddle.Tensor: (T+1,), New tensor containing: xs + [x] with xs.dtype and xs.place

        """
        x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place)
        return paddle.concat((xs, x))

    def score_full(
        self, hyp: Hypothesis, x: paddle.Tensor
    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (paddle.Tensor): Corresponding input feature, (T, D)

        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            # scores[k] shape (self.n_vocab,)
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

    def score_partial(
        self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor
    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (paddle.Tensor): 1D tensor of new partial tokens to score,
                len(ids) < n_vocab
            x (paddle.Tensor): Corresponding input feature, (T, D)

        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape: `(len(ids),)`,
                and state dict that has string keys
                and state values of `self.part_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            # scores[k] shape (len(ids),)
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
        return scores, states

    def beam(
        self, weighted_scores: paddle.Tensor, ids: paddle.Tensor
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute topk full token ids and partial token ids.

        Args:
            weighted_scores (paddle.Tensor): The weighted sum scores for each token.
                Its shape is `(self.n_vocab,)`.
            ids (paddle.Tensor): The partial token ids (global) to compute topk over.

        Returns:
            Tuple[paddle.Tensor, paddle.Tensor]:
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`,
                i.e. (global ids, local ids relative to `ids`).

        """
        # no pre-beam performed; `ids` covers the same vocabulary as `weighted_scores`
        if weighted_scores.shape[0] == ids.shape[0]:
            top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
            return top_ids, top_ids

        # mask tokens pruned by pre-beam so that they are not selected in topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        # top_ids is not equal to local_ids, since their index spaces differ
        top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
        local_ids = weighted_scores[ids].topk(self.beam_size)[1]  # index in len(ids)
        return top_ids, local_ids

    @staticmethod
    def merge_scores(
        prev_scores: Dict[str, float],
        next_full_scores: Dict[str, paddle.Tensor],
        full_idx: int,
        next_part_scores: Dict[str, paddle.Tensor],
        part_idx: int,
    ) -> Dict[str, paddle.Tensor]:
        """Merge scores for new hypothesis.

        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, paddle.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`

        Returns:
            Dict[str, paddle.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are scalar tensors by the scorers.

        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`

        Returns:
            Dict[str, Any]: The new state dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.

        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

    def search(
        self, running_hyps: List[Hypothesis], x: paddle.Tensor
    ) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.

        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on beam
            x (paddle.Tensor): Encoded speech feature (T, D)

        Returns:
            List[Hypothesis]: Best sorted hypotheses

        """
        best_hyps = []
        part_ids = paddle.arange(self.n_vocab)  # no pre-beam
        for hyp in running_hyps:
            # scoring
            weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype)
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                pre_beam_scores = (
                    weighted_scores
                    if self.pre_beam_score_key == "full"
                    else scores[self.pre_beam_score_key]
                )
                part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # add previous hyp score
            weighted_scores += hyp.score

            # update hyps
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # `part_j` is the id of `j` relative to `part_scores`
                # will be (2 x beam at most)
                best_hyps.append(
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
                        scores=self.merge_scores(
                            hyp.scores, scores, j, part_scores, part_j
                        ),
                        states=self.merge_states(states, part_states, part_j),
                    )
                )

            # sort and prune 2 x beam -> beam
            best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
                : min(len(best_hyps), self.beam_size)
            ]
        return best_hyps

    def forward(
        self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Perform beam search.

        Args:
            x (paddle.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses an end-detect function
                to automatically find maximum hypothesis lengths.
                If maxlenratio<0.0, its absolute value is interpreted
                as a constant max output length.
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        # set length bounds
        if maxlenratio == 0:
            maxlen = x.shape[0]
        elif maxlenratio < 0:
            maxlen = -1 * int(maxlenratio)
        else:
            maxlen = max(1, int(maxlenratio * x.shape[0]))
        minlen = int(minlenratio * x.shape[0])
        logger.info("decoder input length: " + str(x.shape[0]))
        logger.info("max output length: " + str(maxlen))
        logger.info("min output length: " + str(minlen))

        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            logger.debug("position " + str(i))
            best = self.search(running_hyps, x)
            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
            # end detection
            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
                logger.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
                logger.info("no hypothesis. Finish decoding.")
                break
            else:
                logger.debug(f"remaining hypotheses: {len(running_hyps)}")

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching eos
        if len(nbest_hyps) == 0:
            logger.warning(
                "there are no N-best results; perform recognition "
                "again with a smaller minlenratio."
            )
            return (
                []
                if minlenratio < 0.1
                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
            )

        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logger.info(
                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
            )
        logger.info(f"total log probability: {best.score:.2f}")
        logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
        logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            logger.info(
                "best hypo: "
                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
                + "\n"
            )
        return nbest_hyps

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: List[Hypothesis],
        ended_hyps: List[Hypothesis],
    ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.

        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (float): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            List[Hypothesis]: The new running hypotheses.

        """
        logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
            logger.debug(
                "best hypo: "
                + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
            )
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logger.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]

        # add ended hypotheses to a final list, and remove them from the current hypotheses
        # (this can be a problem: the number of running hyps may drop below the beam size)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., Word LM needs to add final <eos> score
                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        return remained_hyps


def beam_search(
    x: paddle.Tensor,
    sos: int,
    eos: int,
    beam_size: int,
    vocab_size: int,
    scorers: Dict[str, ScorerInterface],
    weights: Dict[str, float],
    token_list: List[str] = None,
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    pre_beam_ratio: float = 1.5,
    pre_beam_score_key: str = "full",
) -> list:
    """Perform beam search with scorers.

    Args:
        x (paddle.Tensor): Encoded speech feature (T, D)
        sos (int): Start of sequence id
        eos (int): End of sequence id
        beam_size (int): The number of hypotheses kept during search
        vocab_size (int): The size of the vocabulary
        scorers (dict[str, ScorerInterface]): Dict of decoder modules
            e.g., Decoder, CTCPrefixScorer, LM
            The scorer will be ignored if it is `None`
        weights (dict[str, float]): Dict of weights for each scorer
            The scorer will be ignored if its weight is 0
        token_list (list[str]): List of tokens for debug log
        maxlenratio (float): Input length ratio to obtain max output length.
            If maxlenratio=0.0 (default), it uses an end-detect function
            to automatically find maximum hypothesis lengths
        minlenratio (float): Input length ratio to obtain min output length.
        pre_beam_score_key (str): key of scores to perform pre-beam search
        pre_beam_ratio (float): beam size in the pre-beam search
            will be `int(pre_beam_ratio * beam_size)`

    Returns:
        List[Dict]: N-best decoding results

    """
    ret = BeamSearch(
        scorers,
        weights,
        beam_size=beam_size,
        vocab_size=vocab_size,
        pre_beam_ratio=pre_beam_ratio,
        pre_beam_score_key=pre_beam_score_key,
        sos=sos,
        eos=eos,
        token_list=token_list,
    ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
    return [h.asdict() for h in ret]
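A minimal usage sketch of the module above. The import paths are inferred from the relative imports in the file, and the toy scorer is purely illustrative (a real setup would pass a decoder, a CTC prefix scorer and/or an LM); treat this as an assumption-laden example, not as part of the PR.

```python
import paddle

# Import paths assumed from the relative imports above (".utils", ".scorers.scorer_interface").
from deepspeech.decoders.beam_search import BeamSearch
from deepspeech.decoders.scorers.scorer_interface import ScorerInterface


class LengthBonus(ScorerInterface):
    """Toy full scorer: a constant bonus for every token (illustrative only)."""

    def __init__(self, n_vocab: int):
        self.n_vocab = n_vocab

    def init_state(self, x):
        return None

    def score(self, ys, state, x):
        # Uniform "log score" over the vocabulary, shape (n_vocab,).
        return paddle.ones([self.n_vocab], dtype=x.dtype), None


enc_out = paddle.randn([50, 256])  # (T, D) stand-in for a real encoder output
searcher = BeamSearch(
    scorers={"length_bonus": LengthBonus(5000)},
    weights={"length_bonus": 1.0},
    beam_size=10,
    vocab_size=5000,
    sos=4999,
    eos=4999,
)
nbest = searcher(enc_out, maxlenratio=0.0, minlenratio=0.0)  # Layer.__call__ -> forward()
print(nbest[0].yseq.numpy(), float(nbest[0].score))
```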
@@ -0,0 +1,187 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
import os
import sys

import paddle
import soundfile

from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.models.u2 import U2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.training.trainer import Trainer
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.utility import UpdateConfig

logger = Log(__name__).getlog()

# TODO(hui zhang): dynamic load


class U2Tester_Hub(Trainer):
    def __init__(self, config, args):
        # super().__init__(config, args)
        self.args = args
        self.config = config
        self.audio_file = args.audio_file
        self.collate_fn_test = SpeechCollator.from_config(config)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.collator.unit_type,
            vocab_filepath=None,
            spm_model_prefix=config.collator.spm_model_prefix)

    def setup_model(self):
        config = self.config
        model_conf = config.model

        with UpdateConfig(model_conf):
            model_conf.input_dim = self.collate_fn_test.feature_size
            model_conf.output_dim = self.collate_fn_test.vocab_size

        model = U2Model.from_config(model_conf)

        if self.parallel:
            model = paddle.DataParallel(model)

        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)

        self.model = model
        logger.info("Setup model")

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        self.model.eval()
        cfg = self.config.decoding
        audio_file = self.audio_file
        collate_fn_test = self.collate_fn_test
        audio, _ = collate_fn_test.process_utterance(
            audio_file=audio_file, transcript="Hello")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        vocab_list = collate_fn_test.vocab_list

        text_feature = self.collate_fn_test.text_feature
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            text_feature=text_feature,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch,
            ctc_weight=cfg.ctc_weight,
            decoding_chunk_size=cfg.decoding_chunk_size,
            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
            simulate_streaming=cfg.simulate_streaming)
        logger.info("The result_transcripts: " + result_transcripts[0][0])

    def run_test(self):
        self.resume()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)

    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        #self.setup_output_dir()
        #self.setup_checkpointer()

        #self.setup_dataloader()
        self.setup_model()

        self.iteration = 0
        self.epoch = 0

    def resume(self):
        """Resume from the checkpoint at checkpoints in the output
        directory or load a specified checkpoint.
        """
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)


def check(audio_file):
    logger.info("checking the audio file format ...")
    try:
        sig, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "cannot open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    assert (sample_rate == 16000)
    logger.info("The audio file format is correct")


def main_sp(config, args):
    exp = U2Tester_Hub(config, args)
    with exp.eval():
        exp.setup()
        exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()
    print_arguments(args, globals())

    if not os.path.isfile(args.audio_file):
        print("Please input a valid audio file path")
        sys.exit(-1)
    check(args.audio_file)
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    # Setting for profiling
    pr = cProfile.Profile()
    pr.runcall(main, config, args)
    pr.dump_stats('test.profile')
@@ -1,7 +0,0 @@
myst_parser
numpydoc
recommonmark>=0.5.0
sphinx
sphinx-autobuild
sphinx-markdown-tables
sphinx_rtd_theme
@@ -0,0 +1,130 @@
# Parakeet
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.

<div align="center">
  <img src="../../images/logo.png" width=300 /> <br>
</div>


## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).

## Overview

In order to facilitate exploiting the existing TTS models directly and developing new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the processes of training and synthesis. The models supported here include the Text FrontEnd, end-to-end Acoustic models, and Vocoders:

- Text FrontEnd
  - Rule-based Chinese frontend.

- Acoustic Models
  - [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
  - [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
  - [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
  - [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
- Vocoders
  - [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
  - [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- Voice Cloning
  - [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
  - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)

## Setup
It is difficult to install some of this repo's dependencies on Windows, so we recommend that you **DO NOT** use Windows; please use `Linux`.

Make sure the library `libsndfile1` is installed, e.g., on Ubuntu:

```bash
sudo apt-get install libsndfile1
```
### Install PaddlePaddle
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
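For example, a CPU-only installation that satisfies this requirement can be done with `pip`; the exact package (CPU vs. GPU) and version should be chosen for your own device by following the link above:

```bash
pip install paddlepaddle==2.1.2   # or paddlepaddle-gpu for CUDA devices
```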

### Install Parakeet

```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

If some Python dependencies cannot be installed successfully, you can run the following script first
(replace `python3.6` with your own Python version):
```bash
sudo apt install -y python3.6-dev
```

See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.

## Examples
Entry points to the introduction, and to launching training and synthesis, for the different example models:

- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)
- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)

## Audio samples
### TTS models (Acoustic Model + Neural Vocoder)
Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio samples.

## Released Models

### Acoustic Model

#### FastSpeech2/FastPitch
1. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
2. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
3. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)

#### SpeedySpeech
1. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)

#### TransformerTTS

1. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)

#### Tacotron2

1. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)

### Vocoder

#### WaveFlow

1. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)

#### Parallel WaveGAN

1. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
2. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)

### Voice Cloning

#### Tacotron2_AISHELL3

1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

#### GE2E

1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)

## License

Parakeet is provided under the [Apache-2.0 license](LICENSE).
@@ -0,0 +1,583 @@
Audio Sample
==================

The main processes of TTS include:

1. Converting the original text into characters/phonemes, through the ``text frontend`` module.

2. Converting characters/phonemes into acoustic features, such as linear spectrogram, mel spectrogram, LPC features, etc., through ``Acoustic models``.

3. Converting acoustic features into waveforms through ``Vocoders``.

When training ``Tacotron2``, ``TransformerTTS`` and ``WaveFlow``, we use the English single-speaker TTS dataset `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ by default. However, when training ``SpeedySpeech``, ``FastSpeech2`` and ``ParallelWaveGAN``, we use the Chinese single-speaker dataset `CSMSC <https://test.data-baker.com/data/index/source/>`_ by default.

In the future, ``Parakeet`` will mainly use Chinese TTS datasets for its default examples.

Here, we display three types of audio samples:

1. Analysis/synthesis (ground-truth spectrograms + Vocoder)

2. TTS (Acoustic model + Vocoder)

3. Chinese TTS with/without text frontend (mainly tone sandhi)

Analysis/synthesis
--------------------------

Audio samples generated from ground-truth spectrograms with a vocoder.

.. raw:: html
|
||||
|
||||
<b>LJSpeech(English)</b>
|
||||
<br>
|
||||
</br>
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> GT </th>
|
||||
<th align="left"> WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<br>
|
||||
</br>
|
||||
<b>CSMSC(Chinese)</b>
|
||||
<br>
|
||||
</br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> GT (convert to 24k) </th>
|
||||
<th align="left"> ParallelWaveGAN </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009901.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009902.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009903.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009904.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009905.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009901.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009902.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009903.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009904.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009905.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
|
||||
TTS
|
||||
-------------------
|
||||
|
||||
Audio samples generated by a TTS system. Text is first transformed into a spectrogram by a text-to-spectrogram model, then the spectrogram is converted into raw audio by a vocoder.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> TransformerTTS + WaveFlow </th>
|
||||
<th align="left"> Tacotron2 + WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> SpeedySpeech + ParallelWaveGAN </th>
|
||||
<th align="left"> FastSpeech2 + ParallelWaveGAN </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
|
||||
Chinese TTS with/without text frontend
|
||||
--------------------------------------
|
||||
|
||||
We provide a complete Chinese text frontend module in ``Parakeet``. ``Text Normalization`` and ``G2P`` are the most important modules in a text frontend. We assume here that the texts are already normalized, and mainly compare the ``G2P`` module.
|
||||
|
||||
We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> With Text Frontend </th>
|
||||
<th align="left"> Without Text Frontend </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/010.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/010.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
|
||||
</table>
|
@ -0,0 +1,45 @@
|
||||
.. parakeet documentation master file, created by
|
||||
sphinx-quickstart on Fri Sep 10 14:22:24 2021.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Parakeet
|
||||
====================================
|
||||
|
||||
``parakeet`` is a deep-learning-based text-to-speech toolkit built upon the ``paddlepaddle`` framework. It aims to provide a flexible, efficient, and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
|
||||
|
||||
``parakeet`` mainly consists of the components below.
|
||||
|
||||
#. Implementation of models and commonly used neural network layers.
|
||||
#. Dataset abstraction and common data preprocessing pipelines.
|
||||
#. Ready-to-run experiments.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Introduction
|
||||
|
||||
introduction
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Getting started
|
||||
|
||||
install
|
||||
basic_usage
|
||||
advanced_usage
|
||||
cn_text_frontend
|
||||
released_models
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Demos
|
||||
|
||||
demo
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
@ -0,0 +1,47 @@
|
||||
# Installation
|
||||
## Install PaddlePaddle
|
||||
Parakeet requires PaddlePaddle as its backend. Note that PaddlePaddle 2.1.2 or newer is required.
|
||||
|
||||
Since paddlepaddle has multiple packages depending on the device (cpu or gpu) and the dependency libraries, it is recommended to install a proper package of paddlepaddle with respect to the device and dependency library versions via `pip`.
|
||||
|
||||
Installing paddlepaddle with conda or building paddlepaddle from source is also supported. Please refer to [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) for more details.
|
||||
|
||||
Example instructions to install paddlepaddle via pip are listed below.
|
||||
|
||||
### PaddlePaddle with GPU
|
||||
```bash
|
||||
# PaddlePaddle for CUDA 10.1
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
# PaddlePaddle for CUDA 10.2
|
||||
python -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
|
||||
# PaddlePaddle for CUDA 11.0
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post110 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
# PaddlePaddle for CUDA 11.2
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
```
|
||||
### PaddlePaddle with CPU
|
||||
```bash
|
||||
python -m pip install paddlepaddle==2.1.2 -i https://mirror.baidu.com/pypi/simple
|
||||
```
|
||||
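After installation you can optionally check that paddlepaddle works on your device. This is a hedged sketch; `paddle.utils.run_check()` runs PaddlePaddle's built-in sanity check and reports whether the installation (and GPU, if any) is usable.

```bash
python -c "import paddle; paddle.utils.run_check()"
```
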
## Install libsndfile
|
||||
Experiments in parakeet often involve audio and spectrum processing, thus `librosa` and `soundfile` are required. `soundfile` requires an extra C library, `libsndfile`, which is not always handled by pip.
|
||||
|
||||
For Windows and Mac users, `libsndfile` is also installed when installing `soundfile` via pip, but for Linux users, installing `libsndfile` via the system package manager is required. Example commands for popular distributions are listed below.
|
||||
```bash
|
||||
# ubuntu, debian
|
||||
sudo apt-get install libsndfile1
|
||||
# centos, fedora
|
||||
sudo yum install libsndfile
|
||||
# openSUSE
|
||||
sudo zypper in libsndfile
|
||||
```
|
||||
For any problem with the installation of soundfile, please refer to [SoundFile](https://pypi.org/project/SoundFile/).
|
||||
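Once `libsndfile` is in place, a quick sanity check (a hedged sketch; it simply asks `soundfile` which `libsndfile` version it loaded) is:

```bash
python -c "import soundfile; print(soundfile.__libsndfile_version__)"
```
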
## Install Parakeet
|
||||
There are two ways to install parakeet, depending on how you intend to use it.
|
||||
|
||||
1. If you want to run experiments provided by parakeet or add new models and experiments, it is recommended to clone the project from GitHub (Parakeet) and install it in editable mode.
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/Parakeet
|
||||
cd Parakeet
|
||||
pip install -e .
|
||||
```
|
@ -0,0 +1,27 @@
|
||||
# Parakeet - PAddle PARAllel text-to-speech toolKIT
|
||||
|
||||
## What is Parakeet?
|
||||
Parakeet is a deep-learning-based text-to-speech toolkit built upon the PaddlePaddle framework. It aims to provide a flexible, efficient, and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by Baidu Research and other research groups.
|
||||
|
||||
## What can Parakeet do?
|
||||
Parakeet mainly consists of the components below:
|
||||
- Implementation of models and commonly used neural network layers.
|
||||
- Dataset abstraction and common data preprocessing pipelines.
|
||||
- Ready-to-run experiments.
|
||||
|
||||
Parakeet provides you with a complete TTS pipeline, including:
|
||||
- Text FrontEnd
|
||||
- Rule-based Chinese frontend.
|
||||
- Acoustic Models
|
||||
- FastSpeech2
|
||||
- SpeedySpeech
|
||||
- TransformerTTS
|
||||
- Tacotron2
|
||||
- Vocoders
|
||||
- Parallel WaveGAN
|
||||
- WaveFlow
|
||||
- Voice Cloning
|
||||
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
|
||||
- GE2E
|
||||
|
||||
Parakeet helps you to train TTS models with simple commands.
|
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 3 ];then
|
||||
echo "usage: ${0} config_path ckpt_path_prefix audio_file"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
||||
echo "using $ngpu gpus..."
|
||||
|
||||
config_path=$1
|
||||
ckpt_prefix=$2
|
||||
audio_file=$3
|
||||
|
||||
chunk_mode=false
|
||||
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
|
||||
chunk_mode=true
|
||||
fi
|
||||
|
||||
# download language model
|
||||
#bash local/download_lm_ch.sh
|
||||
#if [ $? -ne 0 ]; then
|
||||
# exit 1
|
||||
#fi
|
||||
|
||||
|
||||
|
||||
for type in attention_rescoring; do
|
||||
echo "decoding ${type}"
|
||||
batch_size=1
|
||||
output_dir=${ckpt_prefix}
|
||||
mkdir -p ${output_dir}
|
||||
python3 -u ${BIN_DIR}/test_hub.py \
|
||||
--nproc ${ngpu} \
|
||||
--config ${config_path} \
|
||||
--result_file ${output_dir}/${type}.rsl \
|
||||
--checkpoint_path ${ckpt_prefix} \
|
||||
--opts decoding.decoding_method ${type} \
|
||||
--opts decoding.batch_size ${batch_size} \
|
||||
--audio_file ${audio_file}
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in evaluation!"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
exit 0
|
@ -0,0 +1,4 @@
|
||||
# Aishell3
|
||||
|
||||
* tts0 - fastspeech2
|
||||
* vc0 - tacotron2 voice cloning
|
@ -0,0 +1,88 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset
|
||||
from parakeet.frontend import Vocab
|
||||
from parakeet.data import batch_text_id, batch_spec
|
||||
|
||||
from preprocess_transcription import _phones, _tones
|
||||
|
||||
voc_phones = Vocab(sorted(list(_phones)))
|
||||
print("vocab_phones:\n", voc_phones)
|
||||
voc_tones = Vocab(sorted(list(_tones)))
|
||||
print("vocab_tones:\n", voc_tones)
|
||||
|
||||
|
||||
class AiShell3(Dataset):
|
||||
"""Processed AiShell3 dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
super().__init__()
|
||||
self.root = Path(root).expanduser()
|
||||
self.embed_dir = self.root / "embed"
|
||||
self.mel_dir = self.root / "mel"
|
||||
|
||||
with open(self.root / "metadata.pickle", 'rb') as f:
|
||||
self.records = pickle.load(f)
|
||||
|
||||
def __getitem__(self, index):
|
||||
metadatum = self.records[index]
|
||||
sentence_id = metadatum["sentence_id"]
|
||||
speaker_id = sentence_id[:7]
|
||||
phones = metadatum["phones"]
|
||||
tones = metadatum["tones"]
|
||||
phones = np.array(
|
||||
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
|
||||
tones = np.array(
|
||||
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
|
||||
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
||||
embed = np.load(
|
||||
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
||||
return phones, tones, mel, embed
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
|
||||
def collate_aishell3_examples(examples):
|
||||
phones, tones, mel, embed = list(zip(*examples))
|
||||
|
||||
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
||||
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
||||
T_dec = np.max(spec_lengths)
|
||||
stop_tokens = (
|
||||
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
|
||||
phones, _ = batch_text_id(phones)
|
||||
tones, _ = batch_text_id(tones)
|
||||
mel, _ = batch_spec(mel)
|
||||
mel = np.transpose(mel, (0, 2, 1))
|
||||
embed = np.stack(embed)
|
||||
# 7 fields
|
||||
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
|
||||
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dataset = AiShell3("~/datasets/aishell3/train")
|
||||
example = dataset[0]
|
||||
|
||||
examples = [dataset[i] for i in range(10)]
|
||||
batch = collate_aishell3_examples(examples)
|
||||
|
||||
for field in batch:
|
||||
print(field.shape, field.dtype)
|
@ -0,0 +1,39 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Tuple
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
from preprocess_transcription import split_syllable
|
||||
|
||||
|
||||
def convert_to_pinyin(text: str) -> List[str]:
|
||||
"""convert text into list of syllables, other characters that are not chinese, thus
|
||||
cannot be converted to pinyin, are left as they are.
|
||||
"""
|
||||
syllables = lazy_pinyin(
|
||||
text, style=Style.TONE3, neutral_tone_with_five=True)
|
||||
return syllables
|
||||
|
||||
|
||||
def convert_sentence(text: str) -> Tuple[List[str], List[str]]:
|
||||
"""convert a sentence into two list: phones and tones"""
|
||||
syllables = convert_to_pinyin(text)
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in syllables:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
|
||||
return phones, tones
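

# A hedged usage sketch (not executed here). Given pypinyin's conversion of
# "中国" to "zhong1 guo2", the rules above should yield:
#     >>> convert_sentence("中国")
#     (['zh', 'ueng', 'g', 'uo'], ['0', '1', '0', '2'])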
|
@ -0,0 +1,82 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
_C.data = CN(
|
||||
dict(
|
||||
batch_size=32, # batch size
|
||||
valid_size=64, # the first N examples are reserved for validation
|
||||
sample_rate=22050, # Hz, sample rate
|
||||
n_fft=1024, # fft frame size
|
||||
win_length=1024, # window size
|
||||
hop_length=256, # hop size between adjacent frames
|
||||
fmax=8000, # Hz, max frequency when converting to mel
|
||||
fmin=0, # Hz, min frequency when converting to mel
|
||||
d_mels=80, # mel bands
|
||||
padding_idx=0, # text embedding's padding index
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
vocab_size=70,
|
||||
n_tones=10,
|
||||
reduction_factor=1, # reduction factor
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||
d_prenet=256, # hidden size of decoder prenet
|
||||
# hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_attention_rnn=1024,
|
||||
# hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=1024,
|
||||
d_attention=128, # hidden size of decoder location linear layer
|
||||
attention_filters=32, # number of filter in decoder location conv layer
|
||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||
d_postnet=512, # hidden size of decoder postnet
|
||||
postnet_kernel_size=5, # kernel size of conv layers in postnet
|
||||
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
||||
p_encoder_dropout=0.5, # dropout probability in encoder
|
||||
p_prenet_dropout=0.5, # dropout probability in decoder prenet
|
||||
|
||||
# dropout probability of first rnn layer in decoder
|
||||
p_attention_dropout=0.1,
|
||||
# dropout probability of second rnn layer in decoder
|
||||
p_decoder_dropout=0.1,
|
||||
p_postnet_dropout=0.5, # dropout probability in decoder postnet
|
||||
guided_attention_loss_sigma=0.2,
|
||||
d_global_condition=256,
|
||||
|
||||
# whether to use a classifier to predict stop probability
|
||||
use_stop_token=False,
|
||||
# whether to use guided attention loss in training
|
||||
use_guided_attention_loss=True, ))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
lr=1e-3, # learning rate
|
||||
weight_decay=1e-6, # the coeff of weight decay
|
||||
grad_clip_thresh=1.0, # the clip norm of grad clip.
|
||||
valid_interval=1000, # validation
|
||||
save_interval=1000, # checkpoint
|
||||
max_iteration=500000, # max iteration to train
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
# Return a clone so that the defaults will not be altered
|
||||
# This is for the "local variable" use pattern
|
||||
return _C.clone()
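

# A hedged usage sketch (mirrors how the training and feature-extraction scripts
# in this example consume the config; the yaml file name below is illustrative):
#     cfg = get_cfg_defaults()
#     cfg.merge_from_file("my_overrides.yaml")  # optional yaml overrides
#     cfg.freeze()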
|
@ -0,0 +1,96 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from parakeet.audio import AudioProcessor
|
||||
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
|
||||
|
||||
import tqdm
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def extract_mel(fname: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
p: AudioProcessor,
|
||||
n: NormalizerBase):
|
||||
relative_path = fname.relative_to(input_dir)
|
||||
out_path = (output_dir / relative_path).with_suffix(".npy")
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
wav = p.read_wav(fname)
|
||||
mel = p.mel_spectrogram(wav)
|
||||
mel = n.transform(mel)
|
||||
np.save(out_path, mel)
|
||||
|
||||
|
||||
def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
|
||||
input_dir = Path(input_dir).expanduser()
|
||||
fnames = list(input_dir.rglob(f"*{extension}"))
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
|
||||
config.hop_length, config.d_mels, config.fmin,
|
||||
config.fmax)
|
||||
n = LogMagnitude(1e-5)
|
||||
|
||||
func = partial(
|
||||
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="yaml config file to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the processed wav folder")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/mel",
|
||||
help="path of the folder to save mel spectrograms")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
default_config = get_cfg_defaults()
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
default_config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
default_config.merge_from_list(args.opts)
|
||||
default_config.freeze()
|
||||
audio_config = default_config.data
|
||||
|
||||
extract_mel_multispeaker(audio_config, args.input, args.output)
|
After Width: | Height: | Size: 221 KiB |
After Width: | Height: | Size: 550 KiB |
After Width: | Height: | Size: 514 KiB |
@ -0,0 +1,258 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pickle
|
||||
|
||||
import yaml
|
||||
import tqdm
|
||||
|
||||
zh_pattern = re.compile("[\u4e00-\u9fa5]")
|
||||
|
||||
_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
|
||||
|
||||
_pauses = {'%', '$'}
|
||||
|
||||
_initials = {
|
||||
'b',
|
||||
'p',
|
||||
'm',
|
||||
'f',
|
||||
'd',
|
||||
't',
|
||||
'n',
|
||||
'l',
|
||||
'g',
|
||||
'k',
|
||||
'h',
|
||||
'j',
|
||||
'q',
|
||||
'x',
|
||||
'zh',
|
||||
'ch',
|
||||
'sh',
|
||||
'r',
|
||||
'z',
|
||||
'c',
|
||||
's',
|
||||
}
|
||||
|
||||
_finals = {
|
||||
'ii',
|
||||
'iii',
|
||||
'a',
|
||||
'o',
|
||||
'e',
|
||||
'ea',
|
||||
'ai',
|
||||
'ei',
|
||||
'ao',
|
||||
'ou',
|
||||
'an',
|
||||
'en',
|
||||
'ang',
|
||||
'eng',
|
||||
'er',
|
||||
'i',
|
||||
'ia',
|
||||
'io',
|
||||
'ie',
|
||||
'iai',
|
||||
'iao',
|
||||
'iou',
|
||||
'ian',
|
||||
'ien',
|
||||
'iang',
|
||||
'ieng',
|
||||
'u',
|
||||
'ua',
|
||||
'uo',
|
||||
'uai',
|
||||
'uei',
|
||||
'uan',
|
||||
'uen',
|
||||
'uang',
|
||||
'ueng',
|
||||
'v',
|
||||
've',
|
||||
'van',
|
||||
'ven',
|
||||
'veng',
|
||||
}
|
||||
|
||||
_ernized_symbol = {'&r'}
|
||||
|
||||
_specials = {'<pad>', '<unk>', '<s>', '</s>'}
|
||||
|
||||
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
|
||||
|
||||
|
||||
def is_zh(word):
|
||||
global zh_pattern
|
||||
match = zh_pattern.search(word)
|
||||
return match is not None
|
||||
|
||||
|
||||
def ernized(syllable):
|
||||
return syllable[:2] != "er" and syllable[-2] == 'r'
|
||||
|
||||
|
||||
def convert(syllable):
|
||||
# expansion of o -> uo
|
||||
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
|
||||
# syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
|
||||
# expansion for iong, ong
|
||||
syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
|
||||
|
||||
# expansion for ing, in
|
||||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un", "uen").replace("ui",
|
||||
"uei").replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
|
||||
.replace("ri", "riii")
|
||||
|
||||
# rule for y preceding i, u
|
||||
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
|
||||
|
||||
# rule for w
|
||||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||
|
||||
# rule for v following j, q, x
|
||||
syllable = syllable.replace("ju", "jv").replace("qu",
|
||||
"qv").replace("xu", "xv")
|
||||
|
||||
return syllable
|
||||
|
||||
|
||||
def split_syllable(syllable: str):
|
||||
"""Split a syllable in pinyin into a list of phones and a list of tones.
|
||||
Initials have no tone, represented by '0', while finals have tones from
|
||||
'1,2,3,4,5'.
|
||||
|
||||
e.g.
|
||||
|
||||
zhang1 -> ['zh', 'ang'], ['0', '1']
|
||||
"""
|
||||
if syllable in _pauses:
|
||||
# syllable, tone
|
||||
return [syllable], ['0']
|
||||
|
||||
tone = syllable[-1]
|
||||
syllable = convert(syllable[:-1])
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
|
||||
global _initials
|
||||
if syllable[:2] in _initials:
|
||||
phones.append(syllable[:2])
|
||||
tones.append('0')
|
||||
phones.append(syllable[2:])
|
||||
tones.append(tone)
|
||||
elif syllable[0] in _initials:
|
||||
phones.append(syllable[0])
|
||||
tones.append('0')
|
||||
phones.append(syllable[1:])
|
||||
tones.append(tone)
|
||||
else:
|
||||
phones.append(syllable)
|
||||
tones.append(tone)
|
||||
return phones, tones
|
||||
|
||||
|
||||
def load_aishell3_transcription(line: str):
|
||||
sentence_id, pinyin, text = line.strip().split("|")
|
||||
syllables = pinyin.strip().split()
|
||||
|
||||
results = []
|
||||
|
||||
for syllable in syllables:
|
||||
if syllable in _pauses:
|
||||
results.append(syllable)
|
||||
elif not ernized(syllable):
|
||||
results.append(syllable)
|
||||
else:
|
||||
results.append(syllable[:-2] + syllable[-1])
|
||||
results.append('&r5')
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in results:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
for p in phones:
|
||||
assert p in _phones, p
|
||||
return {
|
||||
"sentence_id": sentence_id,
|
||||
"text": text,
|
||||
"syllables": results,
|
||||
"phones": phones,
|
||||
"tones": tones
|
||||
}
|
||||
|
||||
|
||||
def process_aishell3(dataset_root, output_dir):
|
||||
dataset_root = Path(dataset_root).expanduser()
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
prosody_label_path = dataset_root / "label_train-set.txt"
|
||||
with open(prosody_label_path, 'rt') as f:
|
||||
lines = [line.strip() for line in f]
|
||||
|
||||
records = lines[5:]
|
||||
|
||||
processed_records = []
|
||||
for record in tqdm.tqdm(records):
|
||||
new_record = load_aishell3_transcription(record)
|
||||
processed_records.append(new_record)
|
||||
print(new_record)
|
||||
|
||||
with open(output_dir / "metadata.pickle", 'wb') as f:
|
||||
pickle.dump(processed_records, f)
|
||||
|
||||
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
||||
yaml.safe_dump(
|
||||
processed_records, f, default_flow_style=None, allow_unicode=True)
|
||||
|
||||
print("metadata done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train",
|
||||
help="path of the training dataset,(contains a label_train-set.txt).")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
help="the directory to save the processed transcription."
|
||||
"If not provided, it would be the same as the input.")
|
||||
args = parser.parse_args()
|
||||
if args.output is None:
|
||||
args.output = args.input
|
||||
|
||||
process_aishell3(args.input, args.output)
|
@ -0,0 +1,95 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from multiprocessing import Pool
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from tqdm import tqdm
|
||||
from praatio import tgio
|
||||
|
||||
|
||||
def get_valid_part(fpath):
|
||||
f = tgio.openTextgrid(fpath)
|
||||
|
||||
start = 0
|
||||
phone_entry_list = f.tierDict['phones'].entryList
|
||||
first_entry = phone_entry_list[0]
|
||||
if first_entry.label == "sil":
|
||||
start = first_entry.end
|
||||
|
||||
last_entry = phone_entry_list[-1]
|
||||
if last_entry.label == "sp":
|
||||
end = last_entry.start
|
||||
else:
|
||||
end = last_entry.end
|
||||
return start, end
|
||||
|
||||
|
||||
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
||||
rel_path = fpath.relative_to(source_dir)
|
||||
opath = target_dir / rel_path
|
||||
apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
|
||||
opath.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
start, end = get_valid_part(apath)
|
||||
wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
|
||||
normalized_wav = wav / np.max(wav) * 0.999
|
||||
sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
|
||||
# print(f"{fpath} => {opath}")
|
||||
|
||||
|
||||
def preprocess_aishell3(source_dir, target_dir, alignment_dir):
|
||||
source_dir = Path(source_dir).expanduser()
|
||||
target_dir = Path(target_dir).expanduser()
|
||||
alignment_dir = Path(alignment_dir).expanduser()
|
||||
|
||||
wav_paths = list(source_dir.rglob("*.wav"))
|
||||
print(f"there are {len(wav_paths)} audio files in total")
|
||||
fx = partial(
|
||||
process_utterance,
|
||||
source_dir=source_dir,
|
||||
target_dir=target_dir,
|
||||
alignment_dir=alignment_dir)
|
||||
with Pool(16) as p:
|
||||
list(
|
||||
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process audio in AiShell3, trim silence according to the alignment "
|
||||
"files generated by MFA, and normalize volume by peak.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/wav",
|
||||
help="path of the original audio folder in aishell3.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the folder to save the processed audio files.")
|
||||
parser.add_argument(
|
||||
"--alignment",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/alignment",
|
||||
help="path of the alignment files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
preprocess_aishell3(args.input, args.output, args.alignment)
|
@ -0,0 +1,262 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
|
||||
from parakeet.data import dataset
|
||||
from parakeet.training.cli import default_argument_parser
|
||||
from parakeet.training.experiment import ExperimentBase
|
||||
from parakeet.utils import display, mp_tools
|
||||
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
|
||||
|
||||
from config import get_cfg_defaults
|
||||
from aishell3 import AiShell3, collate_aishell3_examples
|
||||
|
||||
|
||||
class Experiment(ExperimentBase):
|
||||
def compute_losses(self, inputs, outputs):
|
||||
texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs
|
||||
|
||||
mel_outputs = outputs["mel_output"]
|
||||
mel_outputs_postnet = outputs["mel_outputs_postnet"]
|
||||
alignments = outputs["alignments"]
|
||||
|
||||
losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
|
||||
alignments, output_lens, text_lens)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
start = time.time()
|
||||
batch = self.read_batch()
|
||||
data_loader_time = time.time() - start
|
||||
|
||||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
loss = losses["loss"]
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
iteration_time = time.time() - start
|
||||
|
||||
losses_np = {k: float(v) for k, v in losses.items()}
|
||||
# logging
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
for key, value in losses_np.items():
|
||||
self.visualizer.add_scalar(f"train_loss/{key}", value,
|
||||
self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def valid(self):
|
||||
valid_losses = defaultdict(list)
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
for key, value in losses.items():
|
||||
valid_losses[key].append(float(value))
|
||||
|
||||
attention_weights = outputs["alignments"]
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_alignments",
|
||||
display.plot_alignment(attention_weights[0].numpy().T),
|
||||
self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_target_spectrogram",
|
||||
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
|
||||
mel_pred = outputs['mel_outputs_postnet']
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_predicted_spectrogram",
|
||||
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
||||
# logging
|
||||
msg = "Valid: "
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in valid_losses.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
for key, value in valid_losses.items():
|
||||
self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def eval(self):
|
||||
"""Evaluation of Tacotron2 in autoregressive manner."""
|
||||
self.model.eval()
|
||||
mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
|
||||
mel_dir.mkdir(parents=True, exist_ok=True)
|
||||
for i, batch in enumerate(self.test_loader):
|
||||
texts, tones, mels, utterance_embeds, *_ = batch
|
||||
outputs = self.model.infer(
|
||||
texts, tones=tones, global_condition=utterance_embeds)
|
||||
|
||||
display.plot_alignment(outputs["alignments"][0].numpy().T)
|
||||
plt.savefig(mel_dir / f"sentence_{i}.png")
|
||||
plt.close()
|
||||
np.save(mel_dir / f"sentence_{i}",
|
||||
outputs["mel_outputs_postnet"][0].numpy().T)
|
||||
print(f"sentence_{i}")
|
||||
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
model = Tacotron2(
|
||||
vocab_size=config.model.vocab_size,
|
||||
n_tones=config.model.n_tones,
|
||||
d_mels=config.data.d_mels,
|
||||
d_encoder=config.model.d_encoder,
|
||||
encoder_conv_layers=config.model.encoder_conv_layers,
|
||||
encoder_kernel_size=config.model.encoder_kernel_size,
|
||||
d_prenet=config.model.d_prenet,
|
||||
d_attention_rnn=config.model.d_attention_rnn,
|
||||
d_decoder_rnn=config.model.d_decoder_rnn,
|
||||
attention_filters=config.model.attention_filters,
|
||||
attention_kernel_size=config.model.attention_kernel_size,
|
||||
d_attention=config.model.d_attention,
|
||||
d_postnet=config.model.d_postnet,
|
||||
postnet_kernel_size=config.model.postnet_kernel_size,
|
||||
postnet_conv_layers=config.model.postnet_conv_layers,
|
||||
reduction_factor=config.model.reduction_factor,
|
||||
p_encoder_dropout=config.model.p_encoder_dropout,
|
||||
p_prenet_dropout=config.model.p_prenet_dropout,
|
||||
p_attention_dropout=config.model.p_attention_dropout,
|
||||
p_decoder_dropout=config.model.p_decoder_dropout,
|
||||
p_postnet_dropout=config.model.p_postnet_dropout,
|
||||
d_global_condition=config.model.d_global_condition,
|
||||
use_stop_token=config.model.use_stop_token, )
|
||||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
||||
grad_clip = paddle.nn.ClipGradByGlobalNorm(
|
||||
config.training.grad_clip_thresh)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=config.training.lr,
|
||||
parameters=model.parameters(),
|
||||
weight_decay=paddle.regularizer.L2Decay(
|
||||
config.training.weight_decay),
|
||||
grad_clip=grad_clip)
|
||||
criterion = Tacotron2Loss(
|
||||
use_stop_token_loss=config.model.use_stop_token,
|
||||
use_guided_attention_loss=config.model.use_guided_attention_loss,
|
||||
sigma=config.model.guided_attention_loss_sigma)
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.criterion = criterion
|
||||
|
||||
def setup_dataloader(self):
|
||||
args = self.args
|
||||
config = self.config
|
||||
aishell3_dataset = AiShell3(args.data)
|
||||
|
||||
valid_set, train_set = dataset.split(aishell3_dataset,
|
||||
config.data.valid_size)
|
||||
batch_fn = collate_aishell3_examples
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
self.test_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
exp = Experiment(config, args)
|
||||
exp.setup()
|
||||
exp.resume_or_load()
|
||||
if not args.test:
|
||||
exp.run()
|
||||
else:
|
||||
exp.eval()
|
||||
|
||||
|
||||
def main(config, args):
|
||||
if args.nprocs > 1 and args.device == "gpu":
|
||||
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
|
||||
else:
|
||||
main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
parser = default_argument_parser()
|
||||
parser.add_argument("--test", action="store_true")
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
|
@ -0,0 +1,226 @@
|
||||
# Speedyspeech with CSMSC
|
||||
|
||||
This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
|
||||
|
||||
## Dataset
|
||||
### Download and Extract the dataset
|
||||
Download CSMSC from its [official website](https://test.data-baker.com/data/index/source).
|
||||
|
||||
### Get MFA result of CSMSC and Extract it
|
||||
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SpeedySpeech.
|
||||
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
|
||||
|
||||
## Preprocess the dataset
|
||||
Assume the path to the dataset is `~/datasets/BZNSYP`.
|
||||
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
|
||||
Run the command below to preprocess the dataset.
|
||||
```bash
|
||||
./preprocess.sh
|
||||
```
|
||||
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
|
||||
|
||||
```text
|
||||
dump
|
||||
├── dev
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
├── test
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
└── train
|
||||
├── norm
|
||||
├── raw
|
||||
└── feats_stats.npy
|
||||
```
|
||||
|
||||
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains `norm` and `raw` subfolders. The `raw` folder contains the log magnitude mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrograms. The statistics used to normalize the spectrograms are computed from the training set and stored in `dump/train/feats_stats.npy`.
|
||||
|
||||
There is also a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, tones, durations, spectrogram path, and id of each utterance.
|
||||
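For a quick look at its contents, you can pretty-print the first record (a hedged sketch; it assumes each line is a complete JSON record, as the `.jsonl` extension suggests, and the exact field names are defined by the preprocessing script):

```bash
head -n 1 dump/train/norm/metadata.jsonl | python3 -m json.tool
```
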
|
||||
## Train the model
|
||||
`./run.sh` calls `../train.py`.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
Here's the complete help message.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
|
||||
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
|
||||
[--use-relative-path USE_RELATIVE_PATH]
|
||||
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
|
||||
|
||||
Train a Speedyspeech model with sigle speaker dataset.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file.
|
||||
--train-metadata TRAIN_METADATA
|
||||
training data.
|
||||
--dev-metadata DEV_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--device DEVICE device type to use.
|
||||
--nprocs NPROCS number of processes.
|
||||
--verbose VERBOSE verbose.
|
||||
--use-relative-path USE_RELATIVE_PATH
|
||||
whether use relative path in metadata
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones-dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
```
|
||||
|
||||
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
|
||||
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
|
||||
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
|
||||
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
|
||||
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
|
||||
6. `--phones-dict` is the path of the phone vocabulary file.
|
||||
7. `--tones-dict` is the path of the tone vocabulary file. A minimal example invocation combining these options is sketched after this list.
|
||||
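For reference, a minimal `train.py` invocation along the lines of what `./run.sh` passes might look like the sketch below. The paths are illustrative and assume the `dump` layout described above; check `run.sh` for the exact arguments used in this example.

```bash
python3 ../train.py \
    --config=conf/default.yaml \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --output-dir=exp/default \
    --device=gpu \
    --nprocs=1 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt
```
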
|
||||
## Pretrained Model
|
||||
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
|
||||
|
||||
SpeedySpeech checkpoint contains files listed below.
|
||||
```text
|
||||
speedyspeech_nosil_baker_ckpt_0.5
|
||||
├── default.yaml # default config used to train speedyspeech
|
||||
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
|
||||
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
|
||||
├── snapshot_iter_11400.pdz # model parameters and optimizer states
|
||||
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
|
||||
```
|
||||
|
||||
## Synthesize
|
||||
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
|
||||
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
|
||||
```bash
|
||||
unzip pwg_baker_ckpt_0.4.zip
|
||||
```
|
||||
Parallel WaveGAN checkpoint contains files listed below.
|
||||
```text
|
||||
pwg_baker_ckpt_0.4
|
||||
├── pwg_default.yaml # default config used to train parallel wavegan
|
||||
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
|
||||
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
|
||||
```
|
||||
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
./synthesize.sh
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
|
||||
[--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT]
|
||||
[--speedyspeech-stat SPEEDYSPEECH_STAT]
|
||||
[--pwg-config PWG_CONFIG]
|
||||
[--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
|
||||
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
|
||||
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--inference-dir INFERENCE_DIR] [--device DEVICE]
|
||||
[--verbose VERBOSE]
|
||||
|
||||
Synthesize with speedyspeech & parallel wavegan.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--speedyspeech-config SPEEDYSPEECH_CONFIG
|
||||
config file for speedyspeech.
|
||||
--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT
|
||||
speedyspeech checkpoint to load.
|
||||
--speedyspeech-stat SPEEDYSPEECH_STAT
|
||||
mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--pwg-config PWG_CONFIG
|
||||
config file for parallelwavegan.
|
||||
--pwg-checkpoint PWG_CHECKPOINT
|
||||
parallel wavegan generator parameters to load.
|
||||
--pwg-stat PWG_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones-dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
--test-metadata TEST_METADATA
|
||||
test metadata
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir
|
||||
--inference-dir INFERENCE_DIR
|
||||
dir to save inference models
|
||||
--device DEVICE device type to use
|
||||
--verbose VERBOSE verbose
|
||||
```
|
||||
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
|
||||
```bash
|
||||
./synthesize_e2e.sh
|
||||
```
|
||||
```text
|
||||
usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
|
||||
[--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT]
|
||||
[--speedyspeech-stat SPEEDYSPEECH_STAT]
|
||||
[--pwg-config PWG_CONFIG]
|
||||
[--pwg-checkpoint PWG_CHECKPOINT]
|
||||
[--pwg-stat PWG_STAT] [--text TEXT]
|
||||
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
|
||||
[--output-dir OUTPUT_DIR]
|
||||
[--inference-dir INFERENCE_DIR] [--device DEVICE]
|
||||
[--verbose VERBOSE]
|
||||
|
||||
Synthesize with speedyspeech & parallel wavegan.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--speedyspeech-config SPEEDYSPEECH_CONFIG
|
||||
config file for speedyspeech.
|
||||
--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT
|
||||
speedyspeech checkpoint to load.
|
||||
--speedyspeech-stat SPEEDYSPEECH_STAT
|
||||
mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--pwg-config PWG_CONFIG
|
||||
config file for parallelwavegan.
|
||||
--pwg-checkpoint PWG_CHECKPOINT
|
||||
parallel wavegan checkpoint to load.
|
||||
--pwg-stat PWG_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--text TEXT text to synthesize, a 'utt_id sentence' pair per line
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones-dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir
|
||||
--inference-dir INFERENCE_DIR
|
||||
dir to save inference models
|
||||
--device DEVICE device type to use
|
||||
--verbose VERBOSE verbose
|
||||
```
|
||||
1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model.
|
||||
2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model.
|
||||
3. `--text` is the text file, which contains sentences to synthesize.
|
||||
4. `--output-dir` is the directory to save synthesized audio files.
|
||||
5. `--inference-dir` is the directory to save the exported models, which can be used with Paddle Inference.
|
||||
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
|
||||
7. `--phones-dict` is the path of the phone vocabulary file.
|
||||
8. `--tones-dict` is the path of the tone vocabulary file.
|
||||
|
||||
You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
|
||||
```bash
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 synthesize_e2e.py \
|
||||
--speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
|
||||
--speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
|
||||
--speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--text=../sentences.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--inference-dir=exp/default/inference \
|
||||
--device="gpu" \
|
||||
--phones-dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \
|
||||
--tones-dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
|
||||
```
|
@ -0,0 +1,50 @@
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 24000 # Sampling rate.
|
||||
n_fft: 2048 # FFT size.
|
||||
n_shift: 300 # Hop size.
|
||||
win_length: 1200 # Window length.
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation.
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation.
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 64
|
||||
num_workers: 4
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
encoder_hidden_size: 128
|
||||
encoder_kernel_size: 3
|
||||
encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
|
||||
duration_predictor_hidden_size: 128
|
||||
decoder_hidden_size: 128
|
||||
decoder_output_size: 80
|
||||
decoder_kernel_size: 3
|
||||
decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer:
|
||||
optim: adam # optimizer type
|
||||
learning_rate: 0.002 # learning rate
|
||||
max_grad_norm: 1
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 200
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
seed: 10086
|
@ -0,0 +1,146 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path

import soundfile as sf
from paddle import inference
from parakeet.frontend.zh_frontend import Frontend


def main():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--enable-auto-log", action="store_true", help="use auto log")
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phones.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict",
        type=str,
        default="tones.txt",
        help="tone vocabulary file.")

    args, _ = parser.parse_known_args()

    frontend = Frontend(
        phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
    print("frontend done!")

    speedyspeech_config = inference.Config(
        str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
        str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
    speedyspeech_config.enable_use_gpu(100, 0)
    speedyspeech_config.enable_memory_optim()
    speedyspeech_predictor = inference.create_predictor(speedyspeech_config)

    pwg_config = inference.Config(
        str(Path(args.inference_dir) / "pwg.pdmodel"),
        str(Path(args.inference_dir) / "pwg.pdiparams"))
    pwg_config.enable_use_gpu(100, 0)
    pwg_config.enable_memory_optim()
    pwg_predictor = inference.create_predictor(pwg_config)

    if args.enable_auto_log:
        import auto_log
        os.makedirs("output", exist_ok=True)
        pid = os.getpid()
        logger = auto_log.AutoLogger(
            model_name="speedyspeech",
            model_precision='float32',
            batch_size=1,
            data_shape="dynamic",
            save_path="./output/auto_log.log",
            inference_config=speedyspeech_config,
            pids=pid,
            process_name=None,
            gpu_ids=0,
            time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
            warmup=0)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = []

    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    for utt_id, sentence in sentences:
        if args.enable_auto_log:
            logger.times.start()

        input_ids = frontend.get_input_ids(
            sentence, merge_sentences=True, get_tone_ids=True)
        phone_ids = input_ids["phone_ids"]
        tone_ids = input_ids["tone_ids"]
        phones = phone_ids[0]
        tones = tone_ids[0]

        if args.enable_auto_log:
            logger.times.stamp()

        input_names = speedyspeech_predictor.get_input_names()
        phones_handle = speedyspeech_predictor.get_input_handle(input_names[0])
        tones_handle = speedyspeech_predictor.get_input_handle(input_names[1])

        phones_handle.reshape(phones.shape)
        phones_handle.copy_from_cpu(phones)
        tones_handle.reshape(tones.shape)
        tones_handle.copy_from_cpu(tones)

        speedyspeech_predictor.run()
        output_names = speedyspeech_predictor.get_output_names()
        output_handle = speedyspeech_predictor.get_output_handle(
            output_names[0])
        output_data = output_handle.copy_to_cpu()

        input_names = pwg_predictor.get_input_names()
        mel_handle = pwg_predictor.get_input_handle(input_names[0])
        mel_handle.reshape(output_data.shape)
        mel_handle.copy_from_cpu(output_data)

        pwg_predictor.run()
        output_names = pwg_predictor.get_output_names()
        output_handle = pwg_predictor.get_output_handle(output_names[0])
        wav = output_handle.copy_to_cpu()

        if args.enable_auto_log:
            logger.times.stamp()

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)

        if args.enable_auto_log:
            logger.times.end(stamp=True)
        print(f"{utt_id} done!")

    if args.enable_auto_log:
        logger.report()


if __name__ == "__main__":
    main()
@ -0,0 +1,8 @@
#!/bin/bash

python3 inference.py \
    --inference-dir=exp/default/inference \
    --text=../sentences.txt \
    --output-dir=exp/default/pd_infer_out \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt
@ -0,0 +1,65 @@
#!/bin/bash

stage=0
stop_stage=100

export MAIN_ROOT=`realpath ${PWD}/../../../`

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=conf/default.yaml
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Extract features ..."
    python3 ../preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=conf/default.yaml \
        --num-cpu=20 \
        --cut-sil=True \
        --use-relative-path=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats" \
        --use-relative-path=True
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/tone to id, dev and test should use train's stats
    echo "Normalize ..."
    python3 ../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

    python3 ../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

    python3 ../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

fi
@ -0,0 +1,12 @@
#!/bin/bash

python ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=2 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --use-relative-path=True
@ -0,0 +1,16 @@
#!/bin/bash

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
    --speedyspeech-stat=dump/train/feats_stats.npy \
    --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/default/test \
    --inference-dir=exp/default/inference \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --device="gpu"
@ -0,0 +1,196 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
from pathlib import Path

import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode


def evaluate(args, speedyspeech_config, pwg_config):
    # the dataloader logger is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    with open(args.tones_dict, "r") as f:
        tone_id = [line.strip().split() for line in f.readlines()]
    tone_size = len(tone_id)
    print("tone_size:", tone_size)

    model = SpeedySpeech(
        vocab_size=vocab_size,
        tone_size=tone_size,
        **speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    speedyspeech_inference.eval()
    speedyspeech_inference = jit.to_static(
        speedyspeech_inference,
        input_spec=[
            InputSpec([-1], dtype=paddle.int64),
            InputSpec([-1], dtype=paddle.int64),
        ])
    paddle.jit.save(speedyspeech_inference,
                    os.path.join(args.inference_dir, "speedyspeech"))
    speedyspeech_inference = paddle.jit.load(
        os.path.join(args.inference_dir, "speedyspeech"))

    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()
    pwg_inference = jit.to_static(
        pwg_inference,
        input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
    pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))

    frontend = Frontend(
        phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
    print("frontend done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for utt_id, sentence in sentences:
        input_ids = frontend.get_input_ids(
            sentence, merge_sentences=True, get_tone_ids=True)
        phone_ids = input_ids["phone_ids"]
        tone_ids = input_ids["tone_ids"]

        flags = 0
        for i in range(len(phone_ids)):
            part_phone_ids = phone_ids[i]
            part_tone_ids = tone_ids[i]
            with paddle.no_grad():
                mel = speedyspeech_inference(part_phone_ids, part_tone_ids)
                temp_wav = pwg_inference(mel)
            if flags == 0:
                wav = temp_wav
                flags = 1
            else:
                wav = paddle.concat([wav, temp_wav])
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.fs)
        print(f"{utt_id} done!")


def main():
    # parse args and config
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config", type=str, help="config file for speedyspeech.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="config file for parallel wavegan.")
    parser.add_argument(
        "--pwg-checkpoint",
        type=str,
        help="parallel wavegan checkpoint to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args, _ = parser.parse_known_args()

    paddle.set_device(args.device)

    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)


if __name__ == "__main__":
    main()
@ -0,0 +1,16 @@
#!/bin/bash

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python synthesize_e2e.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
    --speedyspeech-stat=dump/train/feats_stats.npy \
    --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --text=../sentences.txt \
    --output-dir=exp/default/test_e2e \
    --inference-dir=exp/default/inference \
    --device="gpu" \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt
@ -0,0 +1,159 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""

import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="metadata file of the dumped raw features to be normalized.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--stats", type=str, required=True, help="statistics file.")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict", type=str, default=None, help="tone vocabulary file.")

    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")

    def str2bool(s):
        return s.lower() == 'true'

    parser.add_argument(
        "--use-relative-path",
        type=str2bool,
        default=False,
        help="whether to use relative paths in metadata")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    if args.use_relative_path:
        # if use_relative_path in preprocess, convert it to absolute path here
        metadata_dir = Path(args.metadata).parent
        for item in metadata:
            item["feats"] = str(metadata_dir / item["feats"])

    dataset = DataTable(
        metadata, converters={
            'feats': np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    scaler = StandardScaler()
    scaler.mean_ = np.load(args.stats)[0]
    scaler.scale_ = np.load(args.stats)[1]
    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    vocab_phones = {}
    with open(args.phones_dict, 'rt') as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    for phn, id in phn_id:
        vocab_phones[phn] = int(id)

    vocab_tones = {}
    with open(args.tones_dict, 'rt') as f:
        tone_id = [line.strip().split() for line in f.readlines()]
    for tone, id in tone_id:
        vocab_tones[tone] = int(id)

    # process each file
    output_metadata = []

    for item in tqdm(dataset):
        utt_id = item['utt_id']
        mel = item['feats']
        # normalize
        mel = scaler.transform(mel)

        # save
        mel_path = dumpdir / f"{utt_id}_feats.npy"
        np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
        phone_ids = [vocab_phones[p] for p in item['phones']]
        tone_ids = [vocab_tones[p] for p in item['tones']]
        if args.use_relative_path:
            # convert absolute path to relative path
            mel_path = mel_path.relative_to(dumpdir)
        output_metadata.append({
            'utt_id': utt_id,
            'phones': phone_ids,
            'tones': tone_ids,
            'num_phones': item['num_phones'],
            'num_frames': item['num_frames'],
            'durations': item['durations'],
            'feats': str(mel_path),
        })
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
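A note on the --stats file read above: normalize.py restores the StandardScaler from rows 0 and 1 of a single .npy, so the statistics produced by compute_statistics.py in the run script are assumed to stack the per-dimension mean and scale along the first axis. A minimal inspection sketch under that assumption:

# Sketch: inspect the stats file the way normalize.py consumes it
# (row 0 = per-dimension mean, row 1 = per-dimension scale).
# The path matches the run scripts above; the layout is an assumption
# inferred from normalize.py, not verified against compute_statistics.py.
import numpy as np

stats = np.load("dump/train/feats_stats.npy")
mean, scale = stats[0], stats[1]
assert mean.shape == scale.shape  # one value per mel bin
print("n_mels:", mean.shape[0])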
@ -0,0 +1,293 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from operator import itemgetter
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import numpy as np
import re
import tqdm
import yaml
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length
from parakeet.datasets.preprocess_utils import get_phones_tones
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import merge_silence
from pathlib import Path
from yacs.config import CfgNode


def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     sentences: Dict,
                     output_dir: Path,
                     mel_extractor=None,
                     cut_sil: bool=True):
    utt_id = fp.stem
    record = None
    if utt_id in sentences:
        # reading, resampling may occur
        wav, _ = librosa.load(str(fp), sr=config.fs)
        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
            return record
        assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(wav).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
        # slightly less precise than using *.TextGrid directly
        times = librosa.frames_to_time(
            d_cumsum, sr=config.fs, hop_length=config.n_shift)
        if cut_sil:
            start = 0
            end = d_cumsum[-1]
            if phones[0] == "sil" and len(durations) > 1:
                start = times[1]
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                end = times[-2]
                durations = durations[:-1]
                phones = phones[:-1]
            sentences[utt_id][0] = phones
            sentences[utt_id][1] = durations
            start, end = librosa.time_to_samples([start, end], sr=config.fs)
            wav = wav[start:end]

        # extract mel feats
        logmel = mel_extractor.get_log_mel_fbank(wav)
        # change duration according to mel_length
        compare_duration_and_mel_length(sentences, utt_id, logmel)
        labels = sentences[utt_id][0]
        # extract phone and duration
        phones = []
        tones = []
        for label in labels:
            # split tone from finals
            match = re.match(r'^(\w+)([012345])$', label)
            if match:
                phones.append(match.group(1))
                tones.append(match.group(2))
            else:
                phones.append(label)
                tones.append('0')
        durations = sentences[utt_id][1]
        num_frames = logmel.shape[0]
        assert sum(durations) == num_frames
        assert len(phones) == len(tones) == len(durations)

        mel_path = output_dir / (utt_id + "_feats.npy")
        np.save(mel_path, logmel)  # (num_frames, n_mels)
        record = {
            "utt_id": utt_id,
            "phones": phones,
            "tones": tones,
            "num_phones": len(phones),
            "num_frames": num_frames,
            "durations": durations,
            "feats": str(mel_path),  # Path object
        }
    return record


def process_sentences(config,
                      fps: List[Path],
                      sentences: Dict,
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1,
                      cut_sil: bool=True,
                      use_relative_path: bool=False):
    if nprocs == 1:
        results = []
        for fp in tqdm.tqdm(fps, total=len(fps)):
            record = process_sentence(config, fp, sentences, output_dir,
                                      mel_extractor, cut_sil)
            if record:
                results.append(record)
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp in fps:
                    future = pool.submit(process_sentence, config, fp,
                                         sentences, output_dir, mel_extractor,
                                         cut_sil)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

                results = []
                for ft in futures:
                    record = ft.result()
                    if record:
                        results.append(record)

    results.sort(key=itemgetter("utt_id"))
    output_dir = Path(output_dir)
    metadata_path = output_dir / "metadata.jsonl"
    # NOTE: use relative path to the meta jsonlines file for Full Chain Project
    with jsonlines.open(metadata_path, 'w') as writer:
        for item in results:
            if use_relative_path:
                item["feats"] = str(Path(item["feats"]).relative_to(output_dir))
            writer.write(item)
    print("Done")


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker} now")

    parser.add_argument(
        "--rootdir", default=None, type=str, help="directory of the dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")

    parser.add_argument(
        "--dur-file",
        default=None,
        type=str,
        help="path to baker durations.txt.")

    parser.add_argument("--config", type=str, help="speedyspeech config file.")

    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of processes.")

    def str2bool(s):
        return s.lower() == 'true'

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether to cut silence at the edges of the audio")

    parser.add_argument(
        "--use-relative-path",
        type=str2bool,
        default=False,
        help="whether to use relative paths in metadata")

    args = parser.parse_args()

    rootdir = Path(args.rootdir).expanduser()
    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)
    dur_file = Path(args.dur_file).expanduser()

    assert rootdir.is_dir()
    assert dur_file.is_file()

    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

    if args.verbose > 1:
        print(vars(args))
        print(config)

    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
    phone_id_map_path = dumpdir / "phone_id_map.txt"
    tone_id_map_path = dumpdir / "tone_id_map.txt"
    get_phones_tones(sentences, phone_id_map_path, tone_id_map_path,
                     args.dataset)

    if args.dataset == "baker":
        wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
        # split data into 3 sections
        num_train = 9800
        num_dev = 100
        train_wav_files = wav_files[:num_train]
        dev_wav_files = wav_files[num_train:num_train + num_dev]
        test_wav_files = wav_files[num_train + num_dev:]

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)
    dev_dump_dir = dumpdir / "dev" / "raw"
    dev_dump_dir.mkdir(parents=True, exist_ok=True)
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    # Extractor
    mel_extractor = LogMelFBank(
        sr=config.fs,
        n_fft=config.n_fft,
        hop_length=config.n_shift,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    # process for the 3 sections
    if train_wav_files:
        process_sentences(
            config,
            train_wav_files,
            sentences,
            train_dump_dir,
            mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            use_relative_path=args.use_relative_path)
    if dev_wav_files:
        process_sentences(
            config,
            dev_wav_files,
            sentences,
            dev_dump_dir,
            mel_extractor,
            cut_sil=args.cut_sil,
            use_relative_path=args.use_relative_path)
    if test_wav_files:
        process_sentences(
            config,
            test_wav_files,
            sentences,
            test_dump_dir,
            mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            use_relative_path=args.use_relative_path)


if __name__ == "__main__":
    main()
@ -0,0 +1,16 @@
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。
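The lines above are the 'utt_id sentence' pairs that inference.py and synthesize_e2e.py read via line.strip().split(). A small reading sketch follows; the maxsplit=1 is a suggested robustness tweak (it keeps a sentence intact if it ever contains spaces) and is not part of this change.

# Sketch of how sentences.txt is consumed by the synthesis scripts above.
# maxsplit=1 is an assumption/tweak, not taken from this change.
sentences = []
with open("sentences.txt", "rt", encoding="utf-8") as f:
    for line in f:
        utt_id, sentence = line.strip().split(maxsplit=1)
        sentences.append((utt_id, sentence))
print(len(sentences), "sentences loaded")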