Merge branch 'develop' of github.com:PaddlePaddle/DeepSpeech into merge_parakeet

3 years ago · 2e9d9dc9a7
parent 3ce5dff460 05288cd381
commit 2e9d9dc9a7
85 changed files with 2310 additions and 148 deletions
--- a/deepspeech/init.py
+++ b/deepspeech/init.py
@ -233,7 +233,8 @@ def is_broadcastable(shp1, shp2):
 def masked_fill(xs: paddle.Tensor,
                mask: paddle.Tensor,
                value: Union[float, int]):
-    assert is_broadcastable(xs.shape, mask.shape) is True
+    assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape,
                                                            mask.shape)
    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
    mask = mask.broadcast_to(bshape)
    trues = paddle.ones_like(xs) * value
@ -315,7 +316,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
    assert len(args) == 1
    if isinstance(args[0], str):  # dtype
        return x.astype(args[0])
-    elif isinstance(args[0], paddle.Tensor):  #Tensor
+    elif isinstance(args[0], paddle.Tensor):  # Tensor
        return x.astype(args[0].dtype)
    else:  # Device
        return x
@ -354,30 +355,153 @@ if not hasattr(paddle.Tensor, 'tolist'):
        "register user tolist to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'tolist', tolist)
 ########### hack paddle.nn #############
 from paddle.nn import Layer
 from typing import Optional
 from typing import Mapping
 from typing import Iterable
 from typing import Tuple
 from typing import Iterator
 from collections import OrderedDict, abc as container_abcs
 # hack loss
 def ctc_loss(logits,
             labels,
             input_lengths,
             label_lengths,
             blank=0,
             reduction='mean',
             norm_by_times=True):
    #logger.info("my ctc loss with norm by times")
    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
                                           input_lengths, label_lengths)
    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
    assert reduction in ['mean', 'sum', 'none']
    if reduction == 'mean':
        loss_out = paddle.mean(loss_out / label_lengths)
    elif reduction == 'sum':
        loss_out = paddle.sum(loss_out)
    return loss_out
 class LayerDict(paddle.nn.Layer):
    r"""Holds submodules in a dictionary.
-logger.debug(
+    :class:`~paddle.nn.LayerDict` can be indexed like a regular Python dictionary,
-    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
+    but modules it contains are properly registered, and will be visible by all
-)
+    :class:`~paddle.nn.Layer` methods.
-F.ctc_loss = ctc_loss
+
    :class:`~paddle.nn.LayerDict` is an **ordered** dictionary that respects
    * the order of insertion, and
    * in :meth:`~paddle.nn.LayerDict.update`, the order of the merged
      ``OrderedDict``, ``dict`` (started from Python 3.6) or another
      :class:`~paddle.nn.LayerDict` (the argument to
      :meth:`~paddle.nn.LayerDict.update`).
    Note that :meth:`~paddle.nn.LayerDict.update` with other unordered mapping
    types (e.g., Python's plain ``dict`` before Python version 3.6) does not
    preserve the order of the merged mapping.
    Args:
        modules (iterable, optional): a mapping (dictionary) of (string: module)
            or an iterable of key-value pairs of type (string, module)
    Example::
        class MyModule(nn.Layer):
            def __init__(self):
                super(MyModule, self).__init__()
                self.choices = nn.LayerDict({
                        'conv': nn.Conv2d(10, 10, 3),
                        'pool': nn.MaxPool2d(3)
                })
                self.activations = nn.LayerDict([
                        ['lrelu', nn.LeakyReLU()],
                        ['prelu', nn.PReLU()]
                ])
            def forward(self, x, choice, act):
                x = self.choices[choice](x)
                x = self.activations[act](x)
                return x
    """
    def __init__(self, modules: Optional[Mapping[str, Layer]]=None) -> None:
        super(LayerDict, self).__init__()
        if modules is not None:
            self.update(modules)
    def __getitem__(self, key: str) -> Layer:
        return self._modules[key]
    def __setitem__(self, key: str, module: Layer) -> None:
        self.add_module(key, module)
    def __delitem__(self, key: str) -> None:
        del self._modules[key]
    def __len__(self) -> int:
        return len(self._modules)
    def __iter__(self) -> Iterator[str]:
        return iter(self._modules)
    def __contains__(self, key: str) -> bool:
        return key in self._modules
    def clear(self) -> None:
        """Remove all items from the LayerDict.
        """
        self._modules.clear()
    def pop(self, key: str) -> Layer:
        r"""Remove key from the LayerDict and return its module.
        Args:
            key (string): key to pop from the LayerDict
        """
        v = self[key]
        del self[key]
        return v
    def keys(self) -> Iterable[str]:
        r"""Return an iterable of the LayerDict keys.
        """
        return self._modules.keys()
    def items(self) -> Iterable[Tuple[str, Layer]]:
        r"""Return an iterable of the LayerDict key/value pairs.
        """
        return self._modules.items()
    def values(self) -> Iterable[Layer]:
        r"""Return an iterable of the LayerDict values.
        """
        return self._modules.values()
    def update(self, modules: Mapping[str, Layer]) -> None:
        r"""Update the :class:`~paddle.nn.LayerDict` with the key-value pairs from a
        mapping or an iterable, overwriting existing keys.
        .. note::
            If :attr:`modules` is an ``OrderedDict``, a :class:`~paddle.nn.LayerDict`, or
            an iterable of key-value pairs, the order of new elements in it is preserved.
        Args:
            modules (iterable): a mapping (dictionary) from string to :class:`~paddle.nn.Layer`,
                or an iterable of key-value pairs of type (string, :class:`~paddle.nn.Layer`)
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("LayerDict.update should be called with an "
                            "iterable of key/value pairs, but got " + type(
                                modules).__name__)
        if isinstance(modules,
                      (OrderedDict, LayerDict, container_abcs.Mapping)):
            for key, module in modules.items():
                self[key] = module
        else:
            # modules here can be a list with two items
            for j, m in enumerate(modules):
                if not isinstance(m, container_abcs.Iterable):
                    raise TypeError("LayerDict update sequence element "
                                    "#" + str(j) + " should be Iterable; is" +
                                    type(m).__name__)
                if not len(m) == 2:
                    raise ValueError("LayerDict update sequence element "
                                     "#" + str(j) + " has length " + str(
                                         len(m)) + "; 2 is required")
                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
                # that's too cumbersome to type correctly with overloads, so we add an ignore here
                self[m[0]] = m[1]  # type: ignore[assignment]
    # remove forward alltogether to fallback on Module's _forward_unimplemented
 if not hasattr(paddle.nn, 'LayerDict'):
    logger.debug(
        "register user LayerDict to paddle.nn, remove this when fixed!")
    setattr(paddle.nn, 'LayerDict', LayerDict)
--- a/deepspeech/decoders/beam_search/init.py
+++ b/deepspeech/decoders/beam_search/init.py
@ -0,0 +1,17 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .batch_beam_search import BatchBeamSearch
 from .beam_search import beam_search
 from .beam_search import BeamSearch
 from .beam_search import Hypothesis
--- a/deepspeech/decoders/beam_search/batch_beam_search.py
+++ b/deepspeech/decoders/beam_search/batch_beam_search.py
@ -0,0 +1,17 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 class BatchBeamSearch():
    pass
--- a/deepspeech/decoders/beam_search/beam_search.py
+++ b/deepspeech/decoders/beam_search/beam_search.py
@ -0,0 +1,530 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Beam search module."""
 from itertools import chain
 from typing import Any
 from typing import Dict
 from typing import List
 from typing import NamedTuple
 from typing import Tuple
 from typing import Union
 import paddle
 from ..scorers.scorer_interface import PartialScorerInterface
 from ..scorers.scorer_interface import ScorerInterface
 from ..utils import end_detect
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 class Hypothesis(NamedTuple):
    """Hypothesis data type."""
    yseq: paddle.Tensor  # (T,)
    score: Union[float, paddle.Tensor] = 0
    scores: Dict[str, Union[float, paddle.Tensor]] = dict()
    states: Dict[str, Any] = dict()
    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v)
                    for k, v in self.scores.items()}, )._asdict()
 class BeamSearch(paddle.nn.Layer):
    """Beam search implementation."""
    def __init__(
            self,
            scorers: Dict[str, ScorerInterface],
            weights: Dict[str, float],
            beam_size: int,
            vocab_size: int,
            sos: int,
            eos: int,
            token_list: List[str]=None,
            pre_beam_ratio: float=1.5,
            pre_beam_score_key: str=None, ):
        """Initialize beam search.
        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM
                The scorer will be ignored if it is `None`
            weights (dict[str, float]): Dict of weights for each scorers
                The scorer will be ignored if its weight is 0
            beam_size (int): The number of hypotheses kept during search
            vocab_size (int): The number of vocabulary
            sos (int): Start of sequence id
            eos (int): End of sequence id
            token_list (list[str]): List of tokens for debug log
            pre_beam_score_key (str): key of scores to perform pre-beam search
            pre_beam_ratio (float): beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`
        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()  # all = full + partial
        self.full_scorers = dict()  # full tokens
        self.part_scorers = dict()  # partial tokens
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = paddle.nn.LayerDict()  # nn.Layer
        for k, v in scorers.items():
            w = weights.get(k, 0)
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
            else:
                self.full_scorers[k] = v
            if isinstance(v, paddle.nn.Layer):
                self.nn_dict[k] = v
        # set configurations
        self.sos = sos
        self.eos = eos
        self.token_list = token_list
        # pre_beam_size > beam_size
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (pre_beam_score_key is not None and pre_beam_score_key != "full" and
                pre_beam_score_key not in self.full_scorers):
            raise KeyError(
                f"{pre_beam_score_key} is not found in {self.full_scorers}")
        # selected `key` scorer to do pre beam search
        self.pre_beam_score_key = pre_beam_score_key
        # do_pre_beam when need, valid and has part_scorers
        self.do_pre_beam = (self.pre_beam_score_key is not None and
                            self.pre_beam_size < self.n_vocab and
                            len(self.part_scorers) > 0)
    def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.
        Args:
            x (paddle.Tensor): The encoder output feature, (T, D)
        Returns:
            Hypothesis: The initial hypothesis.
        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                yseq=paddle.to_tensor([self.sos], place=x.place),
                score=0.0,
                scores=init_scores,
                states=init_states, )
        ]
    @staticmethod
    def append_token(xs: paddle.Tensor,
                     x: Union[int, paddle.Tensor]) -> paddle.Tensor:
        """Append new token to prefix tokens.
        Args:
            xs (paddle.Tensor): The prefix token, (T,)
            x (int): The new token to append
        Returns:
            paddle.Tensor: (T+1,), New tensor contains: xs + [x] with xs.dtype and xs.device
        """
        x = paddle.to_tensor([x], dtype=xs.dtype) if isinstance(x, int) else x
        return paddle.concat((xs, x))
    def score_full(self, hyp: Hypothesis, x: paddle.Tensor
                   ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.
        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (paddle.Tensor): Corresponding input feature, (T, D)
        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`
        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            # scores[k] shape (self.n_vocab,)
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states
    def score_partial(self,
                      hyp: Hypothesis,
                      ids: paddle.Tensor,
                      x: paddle.Tensor
                      ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.
        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (paddle.Tensor): 1D tensor of new partial tokens to score, 
                len(ids) < n_vocab
            x (paddle.Tensor): Corresponding input feature, (T, D)
        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape: `(len(ids),)`,
                and state dict that has string keys
                and state values of `self.part_scorers`
        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            # scores[k] shape (len(ids),)
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k],
                                                   x)
        return scores, states
    def beam(self, weighted_scores: paddle.Tensor,
             ids: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute topk full token ids and partial token ids.
        Args:
            weighted_scores (paddle.Tensor): The weighted sum scores for each tokens.
                Its shape is `(self.n_vocab,)`.
            ids (paddle.Tensor): The partial token ids(Global) to compute topk.
        Returns:
            Tuple[paddle.Tensor, paddle.Tensor]: 
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`.
                i.e. (global ids, global relative local ids).
        """
        # no pre beam performed, `ids` equal to `weighted_scores`
        if weighted_scores.size(0) == ids.size(0):
            top_ids = weighted_scores.topk(
                self.beam_size)[1]  # index in n_vocab
            return top_ids, top_ids
        # mask pruned in pre-beam not to select in topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        # top_ids no equal to local_ids, since ids shape not same
        top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
        local_ids = weighted_scores[ids].topk(
            self.beam_size)[1]  # index in len(ids)
        return top_ids, local_ids
    @staticmethod
    def merge_scores(
            prev_scores: Dict[str, float],
            next_full_scores: Dict[str, paddle.Tensor],
            full_idx: int,
            next_part_scores: Dict[str, paddle.Tensor],
            part_idx: int, ) -> Dict[str, paddle.Tensor]:
        """Merge scores for new hypothesis.
        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, paddle.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`
        Returns:
            Dict[str, paddle.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are scalar tensors by the scorers.
        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores
    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.
        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`
        Returns:
            Dict[str, paddle.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.
        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states
    def search(self, running_hyps: List[Hypothesis],
               x: paddle.Tensor) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.
        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on beam
            x (paddle.Tensor): Encoded speech feature (T, D)
        Returns:
            List[Hypotheses]: Best sorted hypotheses
        """
        best_hyps = []
        part_ids = paddle.arange(self.n_vocab)  # no pre-beam
        for hyp in running_hyps:
            # scoring
            weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype)
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                pre_beam_scores = (weighted_scores
                                   if self.pre_beam_score_key == "full" else
                                   scores[self.pre_beam_score_key])
                part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # add previous hyp score
            weighted_scores += hyp.score
            # update hyps
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # `part_j` is `j` relative id in `part_scores`
                # will be (2 x beam at most)
                best_hyps.append(
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
                        scores=self.merge_scores(hyp.scores, scores, j,
                                                 part_scores, part_j),
                        states=self.merge_states(states, part_states, part_j),
                    ))
            # sort and prune 2 x beam -> beam
            best_hyps = sorted(
                best_hyps, key=lambda x: x.score,
                reverse=True)[:min(len(best_hyps), self.beam_size)]
        return best_hyps
    def forward(self,
                x: paddle.Tensor,
                maxlenratio: float=0.0,
                minlenratio: float=0.0) -> List[Hypothesis]:
        """Perform beam search.
        Args:
            x (paddle.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses a end-detect function
                    to automatically find maximum hypothesis lengths
                If maxlenratio<0.0, its absolute value is interpreted
                    as a constant max output length.
            minlenratio (float): Input length ratio to obtain min output length.
        Returns:
            list[Hypothesis]: N-best decoding results
        """
        # set length bounds
        if maxlenratio == 0:
            maxlen = x.shape[0]
        elif maxlenratio < 0:
            maxlen = -1 * int(maxlenratio)
        else:
            maxlen = max(1, int(maxlenratio * x.size(0)))
        minlen = int(minlenratio * x.size(0))
        logger.info("decoder input length: " + str(x.shape[0]))
        logger.info("max output length: " + str(maxlen))
        logger.info("min output length: " + str(minlen))
        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            logger.debug("position " + str(i))
            best = self.search(running_hyps, x)
            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best,
                                             ended_hyps)
            # end detection
            if maxlenratio == 0.0 and end_detect(
                [h.asdict() for h in ended_hyps], i):
                logger.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
                logger.info("no hypothesis. Finish decoding.")
                break
            else:
                logger.debug(f"remained hypotheses: {len(running_hyps)}")
        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching to eos
        if len(nbest_hyps) == 0:
            logger.warning("there is no N-best results, perform recognition "
                           "again with smaller minlenratio.")
            return ([] if minlenratio < 0.1 else
                    self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1)))
        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logger.info(
                f"{float(v):6.2f} * {self.weights[k]:3} = {float(v) * self.weights[k]:6.2f} for {k}"
            )
        logger.info(f"total log probability: {float(best.score):.2f}")
        logger.info(
            f"normalized log probability: {float(best.score) / len(best.yseq):.2f}"
        )
        logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            # logger.info(
            #     "best hypo: "
            #     + "".join([self.token_list[x] for x in best.yseq[1:-1]])
            #     + "\n"
            # )
            logger.info("best hypo: " + "".join(
                [self.token_list[x] for x in best.yseq[1:]]) + "\n")
        return nbest_hyps
    def post_process(
            self,
            i: int,
            maxlen: int,
            maxlenratio: float,
            running_hyps: List[Hypothesis],
            ended_hyps: List[Hypothesis], ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.
        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (int): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
        Returns:
            List[Hypothesis]: The new running hypotheses.
        """
        logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
            logger.debug("best hypo: " + "".join(
                [self.token_list[x] for x in running_hyps[0].yseq[1:]]))
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logger.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]
        # add ended hypotheses to a final list, and removed them from current hypotheses
        # (this will be a problem, number of hyps < beam)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., Word LM needs to add final <eos> score
                for k, d in chain(self.full_scorers.items(),
                                  self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        return remained_hyps
 def beam_search(
        x: paddle.Tensor,
        sos: int,
        eos: int,
        beam_size: int,
        vocab_size: int,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        token_list: List[str]=None,
        maxlenratio: float=0.0,
        minlenratio: float=0.0,
        pre_beam_ratio: float=1.5,
        pre_beam_score_key: str="full", ) -> list:
    """Perform beam search with scorers.
    Args:
        x (paddle.Tensor): Encoded speech feature (T, D)
        sos (int): Start of sequence id
        eos (int): End of sequence id
        beam_size (int): The number of hypotheses kept during search
        vocab_size (int): The number of vocabulary
        scorers (dict[str, ScorerInterface]): Dict of decoder modules
            e.g., Decoder, CTCPrefixScorer, LM
            The scorer will be ignored if it is `None`
        weights (dict[str, float]): Dict of weights for each scorers
            The scorer will be ignored if its weight is 0
        token_list (list[str]): List of tokens for debug log
        maxlenratio (float): Input length ratio to obtain max output length.
            If maxlenratio=0.0 (default), it uses a end-detect function
            to automatically find maximum hypothesis lengths
        minlenratio (float): Input length ratio to obtain min output length.
        pre_beam_score_key (str): key of scores to perform pre-beam search
        pre_beam_ratio (float): beam size in the pre-beam search
            will be `int(pre_beam_ratio * beam_size)`
    Returns:
        List[Dict]: N-best decoding results
    """
    ret = BeamSearch(
        scorers,
        weights,
        beam_size=beam_size,
        vocab_size=vocab_size,
        pre_beam_ratio=pre_beam_ratio,
        pre_beam_score_key=pre_beam_score_key,
        sos=sos,
        eos=eos,
        token_list=token_list, ).forward(
            x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
    return [h.asdict() for h in ret]
--- a/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp
@ -53,7 +53,8 @@ std::string ctc_greedy_decoder(
    std::string best_path_result;
    for (size_t i = 0; i < idx_vec.size(); ++i) {
        if (idx_vec[i] != blank_id) {
-            best_path_result += vocabulary[idx_vec[i]];
+            std::string ch = vocabulary[idx_vec[i]];
            best_path_result += (ch == kSPACE) ? tSPACE : ch;
        }
    }
    return best_path_result;
--- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
@ -74,7 +74,8 @@ std::vector<std::pair<double, std::string>> get_beam_search_result(
        // convert index to string
        std::string output_str;
        for (size_t j = 0; j < output.size(); j++) {
-            output_str += vocabulary[output[j]];
+            std::string ch = vocabulary[output[j]];
            output_str += (ch == kSPACE) ? tSPACE : ch;
        }
        std::pair<double, std::string> output_pair(
            -space_prefixes[i]->approx_ctc, output_str);
--- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
+++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
@ -21,6 +21,7 @@
 #include "path_trie.h"
 const std::string kSPACE = "<space>";
 const std::string tSPACE = " ";
 const float NUM_FLT_INF = std::numeric_limits<float>::max();
 const float NUM_FLT_MIN = std::numeric_limits<float>::min();
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@ -0,0 +1,188 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
 import jsonlines
 import paddle
 from yacs.config import CfgNode
 from .beam_search import BatchBeamSearch
 from .beam_search import BeamSearch
 from .scorers.length_bonus import LengthBonus
 from .scorers.scorer_interface import BatchScorerInterface
 from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
 from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
 logger = Log(__name__).getlog()
 # NOTE: you need this func to generate our sphinx doc
 def load_trained_model(args):
    args.nprocs = args.ngpu
    confs = CfgNode()
    confs.set_new_allowed(True)
    confs.merge_from_file(args.model_conf)
    class_obj = dynamic_import_tester(args.model_name)
    exp = class_obj(confs, args)
    with exp.eval():
        exp.setup()
        exp.restore()
    char_list = exp.args.char_list
    model = exp.model
    return model, char_list, exp, confs
 def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.
    Args:
        args (namespace): The program arguments.
        See py:func:`bin.asr_recog.get_parser` for details
    """
    logger.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError("multi-utt batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")
    # set_deterministic(args)
    model, char_list, exp, confs = load_trained_model(args)
    assert isinstance(model, ASRInterface)
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=confs.collator.augmentation_config
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={"train": False}, )
    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(char_list), lm_args)
        torch_load(args.rnnlm, lm)
        lm.eval()
    else:
        lm = None
    if args.ngram_model:
        from .scorers.ngram import NgramFullScorer
        from .scorers.ngram import NgramPartScorer
        if args.ngram_scorer == "full":
            ngram = NgramFullScorer(args.ngram_model, char_list)
        else:
            ngram = NgramPartScorer(args.ngram_model, char_list)
    else:
        ngram = None
    scorers = model.scorers()  # decoder
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(char_list))
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
        length_bonus=args.penalty, )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=char_list,
        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )
    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
            k for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            beam_search.__class__ = BatchBeamSearch
            logger.info("BatchBeamSearch implementation is selected.")
        else:
            logger.warning(f"As non-batch scorers {non_batch} are found, "
                           f"fall back to non-batch implementation.")
    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    if args.ngpu == 1:
        device = "gpu:0"
    else:
        device = "cpu"
    paddle.set_device(device)
    dtype = getattr(paddle, args.dtype)
    logger.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype)
    model.eval()
    beam_search.to(device=device, dtype=dtype)
    beam_search.eval()
    # read json data
    js = []
    with jsonlines.open(args.recog_json, "r") as reader:
        for item in reader:
            js.append(item)
    # jsonlines to dict, key by 'utt', value by jsonline
    js = {item['utt']: item for item in js}
    new_js = {}
    with paddle.no_grad():
        with jsonlines.open(args.result_label, "w") as f:
            for idx, name in enumerate(js.keys(), 1):
                logger.info(f"({idx}/{len(js.keys())}) decoding " + name)
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                logger.info(f'feat: {feat.shape}')
                enc = model.encode(paddle.to_tensor(feat).to(dtype))
                logger.info(f'eout: {enc.shape}')
                nbest_hyps = beam_search(
                    x=enc,
                    maxlenratio=args.maxlenratio,
                    minlenratio=args.minlenratio)
                nbest_hyps = [
                    h.asdict()
                    for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
                ]
                new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                                   char_list)
                item = new_js[name]['output'][0]  # 1-best
                ref = item['text']
                rec_text = item['rec_text'].replace('▁', ' ').replace(
                    '<eos>', '').strip()
                rec_tokenid = list(map(int, item['rec_tokenid'].split()))
                f.write({
                    "utt": name,
                    "refs": [ref],
                    "hyps": [rec_text],
                    "hyps_tokenid": [rec_tokenid],
                })
--- a/deepspeech/decoders/recog_bin.py
+++ b/deepspeech/decoders/recog_bin.py
@ -0,0 +1,376 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """End-to-end speech recognition model decoding script."""
 import logging
 import os
 import random
 import sys
 from distutils.util import strtobool
 import configargparse
 import numpy as np
 from deepspeech.decoders.recog import recog_v2
 def get_parser():
    """Get default arguments."""
    parser = configargparse.ArgumentParser(
        description="Transcribe text from speech using "
        "a speech recognition model on one CPU or GPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter, )
    parser.add(
        '--model-name',
        type=str,
        default='u2_kaldi',
        help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st')
    # general configuration
    parser.add("--config", is_config_file=True, help="Config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="Second config file path that overwrites the settings in `--config`",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="Third config file path that overwrites the settings "
        "in `--config` and `--config2`", )
    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)", )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument(
        "--verbose", "-V", type=int, default=2, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        type=int,
        default=1,
        help="Batch size for beam search (0: means no batch processing)", )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing", )
    parser.add_argument(
        "--api",
        default="v2",
        choices=["v2"],
        help="Beam search APIs "
        "v2: Experimental API. It supports any models that implements ScorerInterface.",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)")
    parser.add_argument(
        "--result-label",
        type=str,
        required=True,
        help="Filename of result label data (json)", )
    # model (parameter) related
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Model file parameters to read")
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file")
    parser.add_argument(
        "--num-spkrs",
        type=int,
        default=1,
        choices=[1, 2],
        help="Number of speakers in the speech", )
    parser.add_argument(
        "--num-encs",
        default=1,
        type=int,
        help="Number of encoders in the model.")
    # search related
    parser.add_argument(
        "--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
    parser.add_argument(
        "--penalty", type=float, default=0.0, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        type=float,
        default=0.0,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths.
                        If maxlenratio<0.0, its absolute value is interpreted
                        as a constant max output length""", )
    parser.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain min output length", )
    parser.add_argument(
        "--ctc-weight",
        type=float,
        default=0.0,
        help="CTC weight in joint decoding")
    parser.add_argument(
        "--weights-ctc-dec",
        type=float,
        action="append",
        help="ctc weight assigned to each encoder during decoding."
        "[in multi-encoder mode only]", )
    parser.add_argument(
        "--ctc-window-margin",
        type=int,
        default=0,
        help="""Use CTC window with margin parameter to accelerate
                        CTC/attention decoding especially on GPU. Smaller magin
                        makes decoding faster, but may increase search errors.
                        If margin=0 (default), this function is disabled""", )
    # transducer related
    parser.add_argument(
        "--search-type",
        type=str,
        default="default",
        choices=["default", "nsc", "tsd", "alsd", "maes"],
        help="""Type of beam search implementation to use during inference.
        Can be either: default beam search ("default"),
        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
        Alignment-Length Synchronous Decoding ("alsd") or
        modified Adaptive Expansion Search ("maes").""", )
    parser.add_argument(
        "--nstep",
        type=int,
        default=1,
        help="""Number of expansion steps allowed in NSC beam search or mAES
        (nstep > 0 for NSC and nstep > 1 for mAES).""", )
    parser.add_argument(
        "--prefix-alpha",
        type=int,
        default=2,
        help="Length prefix difference allowed in NSC beam search or mAES.", )
    parser.add_argument(
        "--max-sym-exp",
        type=int,
        default=2,
        help="Number of symbol expansions allowed in TSD.", )
    parser.add_argument(
        "--u-max",
        type=int,
        default=400,
        help="Length prefix difference allowed in ALSD.", )
    parser.add_argument(
        "--expansion-gamma",
        type=float,
        default=2.3,
        help="Allowed logp difference for prune-by-value method in mAES.", )
    parser.add_argument(
        "--expansion-beta",
        type=int,
        default=2,
        help="""Number of additional candidates for expanded hypotheses
                selection in mAES.""", )
    parser.add_argument(
        "--score-norm",
        type=strtobool,
        nargs="?",
        default=True,
        help="Normalize final hypotheses' score by length", )
    parser.add_argument(
        "--softmax-temperature",
        type=float,
        default=1.0,
        help="Penalization term for softmax function.", )
    # rnnlm related
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read")
    parser.add_argument(
        "--rnnlm-conf",
        type=str,
        default=None,
        help="RNNLM model config file to read")
    parser.add_argument(
        "--word-rnnlm",
        type=str,
        default=None,
        help="Word RNNLM model file to read")
    parser.add_argument(
        "--word-rnnlm-conf",
        type=str,
        default=None,
        help="Word RNNLM model config file to read", )
    parser.add_argument(
        "--word-dict", type=str, default=None, help="Word list to read")
    parser.add_argument(
        "--lm-weight", type=float, default=0.1, help="RNNLM weight")
    # ngram related
    parser.add_argument(
        "--ngram-model",
        type=str,
        default=None,
        help="ngram model file to read")
    parser.add_argument(
        "--ngram-weight", type=float, default=0.1, help="ngram weight")
    parser.add_argument(
        "--ngram-scorer",
        type=str,
        default="part",
        choices=("full", "part"),
        help="""if the ngram is set as a part scorer, similar with CTC scorer,
                ngram scorer only scores topK hypethesis.
                if the ngram is set as full scorer, ngram scorer scores all hypthesis
                the decoding speed of part scorer is musch faster than full one""",
    )
    # streaming related
    parser.add_argument(
        "--streaming-mode",
        type=str,
        default=None,
        choices=["window", "segment"],
        help="""Use streaming recognizer for inference.
                        `--batchsize` must be set to 0 to enable this mode""", )
    parser.add_argument(
        "--streaming-window", type=int, default=10, help="Window size")
    parser.add_argument(
        "--streaming-min-blank-dur",
        type=int,
        default=10,
        help="Minimum blank duration threshold", )
    parser.add_argument(
        "--streaming-onset-margin", type=int, default=1, help="Onset margin")
    parser.add_argument(
        "--streaming-offset-margin", type=int, default=1, help="Offset margin")
    # non-autoregressive related
    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
    parser.add_argument(
        "--maskctc-n-iterations",
        type=int,
        default=10,
        help="Number of decoding iterations."
        "For Mask CTC, set 0 to predict 1 mask/iter.", )
    parser.add_argument(
        "--maskctc-probability-threshold",
        type=float,
        default=0.999,
        help="Threshold probability for CTC output", )
    # quantize model related
    parser.add_argument(
        "--quantize-config",
        nargs="*",
        help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]",
    )
    parser.add_argument(
        "--quantize-dtype",
        type=str,
        default="qint8",
        help="Dtype dynamic quantize")
    parser.add_argument(
        "--quantize-asr-model",
        type=bool,
        default=False,
        help="Quantize asr model", )
    parser.add_argument(
        "--quantize-lm-model",
        type=bool,
        default=False,
        help="Quantize lm model", )
    return parser
 def main(args):
    """Run the main decoding function."""
    parser = get_parser()
    parser.add_argument(
        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
    parser.add_argument(
        "--checkpoint_path", type=str, help="path to load checkpoint")
    parser.add_argument("--dict-path", type=str, help="path to load checkpoint")
    args = parser.parse_args(args)
    if args.ngpu == 0 and args.dtype == "float16":
        raise ValueError(
            f"--dtype {args.dtype} does not support the CPU backend.")
    # logging info
    if args.verbose == 1:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose == 2:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")
    logging.info(args)
    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif args.ngpu != len(cvd.split(",")):
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)
        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)
    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
    # seed setting
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)
    # validate rnn options
    if args.rnnlm is not None and args.word_rnnlm is not None:
        logging.error(
            "It seems that both --rnnlm and --word-rnnlm are specified. "
            "Please use either option.")
        sys.exit(1)
    # recog
    if args.num_spkrs == 1:
        if args.num_encs == 1:
            # Experimental API that supports custom LMs
            if args.api == "v2":
                recog_v2(args)
            else:
                raise ValueError("Only support --api v2")
        else:
            if args.api == "v2":
                raise NotImplementedError(
                    f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
                )
    elif args.num_spkrs == 2:
        raise ValueError("asr_mix not supported.")
 if __name__ == "__main__":
    main(sys.argv[1:])
--- a/deepspeech/decoders/scorers/init.py
+++ b/deepspeech/decoders/scorers/init.py
--- a/deepspeech/decoders/scorers/ctc.py
+++ b/deepspeech/decoders/scorers/ctc.py
--- a/deepspeech/decoders/scorers/ctc_prefix_score.py
+++ b/deepspeech/decoders/scorers/ctc_prefix_score.py
--- a/deepspeech/decoders/scorers/length_bonus.py
+++ b/deepspeech/decoders/scorers/length_bonus.py
--- a/deepspeech/decoders/scorers/ngram.py
+++ b/deepspeech/decoders/scorers/ngram.py
--- a/deepspeech/decoders/scorers/scorer_interface.py
+++ b/deepspeech/decoders/scorers/scorer_interface.py
@ -145,9 +145,11 @@ class PartialScorerInterface(ScorerInterface):
    and receives pre-pruned next tokens to score because it is too heavy to score
    all the tokens.
    Score sub-set of tokens, not all.
    Examples:
         * Prefix search for connectionist-temporal-classification models
-             * :class:`espnet.nets.scorers.ctc.CTCPrefixScorer`
+             * :class:`decoders.scorers.ctc.CTCPrefixScorer`
    """
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
@ -11,8 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
-__all__ = ["end_detect"]
+from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 __all__ = ["end_detect", "parse_hypothesis", "add_results_to_json"]
 def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
@ -47,3 +51,79 @@ def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
        return True
    else:
        return False
 # * ------------------ recognition related ------------------ *
 def parse_hypothesis(hyp, char_list):
    """Parse hypothesis.
    Args:
        hyp (list[dict[str, Any]]): Recognition hypothesis.
        char_list (list[str]): List of characters.
    Returns:
        tuple(str, str, str, float)
    """
    # remove sos and get results
    tokenid_as_list = list(map(int, hyp["yseq"][1:]))
    token_as_list = [char_list[idx] for idx in tokenid_as_list]
    score = float(hyp["score"])
    # convert to string
    tokenid = " ".join([str(idx) for idx in tokenid_as_list])
    token = " ".join(token_as_list)
    text = "".join(token_as_list).replace("<space>", " ")
    return text, token, tokenid, score
 def add_results_to_json(js, nbest_hyps, char_list):
    """Add N-best results to json.
    Args:
        js (dict[str, Any]): Groundtruth utterance dict.
        nbest_hyps_sd (list[dict[str, Any]]):
            List of hypothesis for multi_speakers: nutts x nspkrs.
        char_list (list[str]): List of characters.
    Returns:
        dict[str, Any]: N-best results added utterance dict.
    """
    # copy old json info
    new_js = dict()
    new_js["utt2spk"] = js["utt2spk"]
    new_js["output"] = []
    for n, hyp in enumerate(nbest_hyps, 1):
        # parse hypothesis
        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp,
                                                                   char_list)
        # copy ground-truth
        if len(js["output"]) > 0:
            out_dic = dict(js["output"][0].items())
        else:
            # for no reference case (e.g., speech translation)
            out_dic = {"name": ""}
        # update name
        out_dic["name"] += "[%d]" % n
        # add recognition results
        out_dic["rec_text"] = rec_text
        out_dic["rec_token"] = rec_token
        out_dic["rec_tokenid"] = rec_tokenid
        out_dic["score"] = score
        # add to list of N-best result dicts
        new_js["output"].append(out_dic)
        # show 1-best result
        if n == 1:
            if "text" in out_dic.keys():
                logger.info("groundtruth: %s" % out_dic["text"])
            logger.info("prediction : %s" % out_dic["rec_text"])
    return new_js
--- a/deepspeech/exps/init.py
+++ b/deepspeech/exps/init.py
@ -11,3 +11,52 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils.dynamic_import import dynamic_import
 model_trainer_alias = {
    "ds2": "deepspeech.exp.deepspeech2.model:DeepSpeech2Trainer",
    "u2": "deepspeech.exps.u2.model:U2Trainer",
    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Trainer",
    "u2_st": "deepspeech.exps.u2_st.model:U2STTrainer",
 }
 def dynamic_import_trainer(module):
    """Import Trainer dynamically.
    Args:
        module (str): trainer name. e.g., ds2, u2, u2_kaldi
    Returns:
        type: Trainer class
    """
    model_class = dynamic_import(module, model_trainer_alias)
    assert issubclass(model_class,
                      Trainer), f"{module} does not implement Trainer"
    return model_class
 model_tester_alias = {
    "ds2": "deepspeech.exp.deepspeech2.model:DeepSpeech2Tester",
    "u2": "deepspeech.exps.u2.model:U2Tester",
    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
    "u2_st": "deepspeech.exps.u2_st.model:U2STTester",
 }
 def dynamic_import_tester(module):
    """Import Tester dynamically.
    Args:
        module (str): tester name. e.g., ds2, u2, u2_kaldi
    Returns:
        type: Tester class
    """
    model_class = dynamic_import(module, model_tester_alias)
    assert issubclass(model_class,
                      Trainer), f"{module} does not implement Tester"
    return model_class
--- a/deepspeech/exps/deepspeech2/bin/test_hub.py
+++ b/deepspeech/exps/deepspeech2/bin/test_hub.py
@ -56,11 +56,6 @@ class DeepSpeech2Tester_hub():
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch)
        #replace the '<space>' with ' '
        result_transcripts = [
            self._text_featurizer.detokenize(sentence)
            for sentence in result_transcripts
        ]
        return result_transcripts
@ -167,9 +162,8 @@ def check(audio_file):
 def main_sp(config, args):
    exp = DeepSpeech2Tester_hub(config, args)
-    with exp.eval():
+    exp.setup()
-        exp.setup()
+    exp.run_test()
        exp.run_test()
 def main(config, args):
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@ -341,11 +341,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch)
        #replace the <space> with ' '
        result_transcripts = [
            self._text_featurizer.detokenize(sentence)
            for sentence in result_transcripts
        ]
        self.autolog.times.stamp()
        self.autolog.times.stamp()
--- a/deepspeech/exps/u2/bin/test_hub.py
+++ b/deepspeech/exps/u2/bin/test_hub.py
@ -0,0 +1,187 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Evaluation for U2 model."""
 import cProfile
 import os
 import sys
 import paddle
 import soundfile
 from deepspeech.exps.u2.config import get_cfg_defaults
 from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.cli import default_argument_parser
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import print_arguments
 from deepspeech.utils.utility import UpdateConfig
 logger = Log(__name__).getlog()
 # TODO(hui zhang): dynamic load
 class U2Tester_Hub(Trainer):
    def __init__(self, config, args):
        # super().__init__(config, args)
        self.args = args
        self.config = config
        self.audio_file = args.audio_file
        self.collate_fn_test = SpeechCollator.from_config(config)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.collator.unit_type,
            vocab_filepath=None,
            spm_model_prefix=config.collator.spm_model_prefix)
    def setup_model(self):
        config = self.config
        model_conf = config.model
        with UpdateConfig(model_conf):
            model_conf.input_dim = self.collate_fn_test.feature_size
            model_conf.output_dim = self.collate_fn_test.vocab_size
        model = U2Model.from_config(model_conf)
        if self.parallel:
            model = paddle.DataParallel(model)
        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)
        self.model = model
        logger.info("Setup model")
    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        self.model.eval()
        cfg = self.config.decoding
        audio_file = self.audio_file
        collate_fn_test = self.collate_fn_test
        audio, _ = collate_fn_test.process_utterance(
            audio_file=audio_file, transcript="Hello")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        vocab_list = collate_fn_test.vocab_list
        text_feature = self.collate_fn_test.text_feature
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            text_feature=text_feature,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch,
            ctc_weight=cfg.ctc_weight,
            decoding_chunk_size=cfg.decoding_chunk_size,
            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
            simulate_streaming=cfg.simulate_streaming)
        logger.info("The result_transcripts: " + result_transcripts[0][0])
    def run_test(self):
        self.resume()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        #self.setup_output_dir()
        #self.setup_checkpointer()
        #self.setup_dataloader()
        self.setup_model()
        self.iteration = 0
        self.epoch = 0
    def resume(self):
        """Resume from the checkpoint at checkpoints in the output
        directory or load a specified checkpoint.
        """
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)
 def check(audio_file):
    logger.info("checking the audio file format......")
    try:
        sig, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    assert (sample_rate == 16000)
    logger.info("The audio file format is right")
 def main_sp(config, args):
    exp = U2Tester_Hub(config, args)
    with exp.eval():
        exp.setup()
        exp.run_test()
 def main(config, args):
    main_sp(config, args)
 if __name__ == "__main__":
    parser = default_argument_parser()
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()
    print_arguments(args, globals())
    if not os.path.isfile(args.audio_file):
        print("Please input the right audio file path")
        sys.exit(-1)
    check(args.audio_file)
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)
    # Setting for profiling
    pr = cProfile.Profile()
    pr.runcall(main, config, args)
    pr.dump_stats('test.profile')
--- a/deepspeech/exps/u2_kaldi/bin/recog.py
+++ b/deepspeech/exps/u2_kaldi/bin/recog.py
@ -0,0 +1,19 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
 from deepspeech.decoders.recog_bin import main
 if __name__ == "__main__":
    main(sys.argv[1:])
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@ -317,11 +317,9 @@ class U2Trainer(Trainer):
        with UpdateConfig(model_conf):
            model_conf.input_dim = self.train_loader.feat_dim
            model_conf.output_dim = self.train_loader.vocab_size
        model = U2Model.from_config(model_conf)
        if self.parallel:
            model = paddle.DataParallel(model)
        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)
        # lr
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@ -207,7 +207,7 @@ class TextFeaturizer():
        """Load vocabulary from file."""
        vocab_list = load_dict(vocab_filepath, maskctc)
        assert vocab_list is not None
-        logger.info(f"Vocab: {pformat(vocab_list)}")
+        logger.debug(f"Vocab: {pformat(vocab_list)}")
        id2token = dict(
            [(idx, token) for (idx, token) in enumerate(vocab_list)])
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
@ -354,7 +354,7 @@ def make_batchset(
    :param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_out: maximum number of input+output frames in a minibatch.
    :param str count: strategy to count maximum size of batch.
-        For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
+        For choices, see io.batchfy.BATCH_COUNT_CHOICES
    :param int max_length_in: maximum length of input to decide adaptive batch size
    :param int max_length_out: maximum length of output to decide adaptive batch size
--- a/deepspeech/models/asr_interface.py
+++ b/deepspeech/models/asr_interface.py
@ -0,0 +1,161 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ASR Interface module."""
 import argparse
 from deepspeech.utils.dynamic_import import dynamic_import
 class ASRInterface:
    """ASR Interface for ESPnet model implementation."""
    @staticmethod
    def add_arguments(parser):
        """Add arguments to parser."""
        return parser
    @classmethod
    def build(cls, idim: int, odim: int, **kwargs):
        """Initialize this class with python-level args.
        Args:
            idim (int): The number of an input feature dim.
            odim (int): The number of output vocab.
        Returns:
            ASRinterface: A new instance of ASRInterface.
        """
        args = argparse.Namespace(**kwargs)
        return cls(idim, odim, args)
    def forward(self, xs, ilens, ys, olens):
        """Compute loss for training.
        :param xs: batch of padded source sequences paddle.Tensor (B, Tmax, idim)
        :param ilens: batch of lengths of source sequences (B), paddle.Tensor
        :param ys: batch of padded target sequences paddle.Tensor (B, Lmax)
        :param olens: batch of lengths of target sequences (B), paddle.Tensor
        :return: loss value
        :rtype: paddle.Tensor
        """
        raise NotImplementedError("forward method is not implemented")
    def recognize(self, x, recog_args, char_list=None, rnnlm=None):
        """Recognize x for evaluation.
        :param ndarray x: input acouctic feature (B, T, D) or (T, D)
        :param namespace recog_args: argment namespace contraining options
        :param list char_list: list of characters
        :param paddle.nn.Layer rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("recognize method is not implemented")
    def recognize_batch(self, x, recog_args, char_list=None, rnnlm=None):
        """Beam search implementation for batch.
        :param paddle.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
        :param namespace recog_args: argument namespace containing options
        :param list char_list: list of characters
        :param paddle.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        raise NotImplementedError("Batch decoding is not supported yet.")
    def calculate_all_attentions(self, xs, ilens, ys):
        """Calculate attention.
        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
        :param ndarray ilens: batch of lengths of input sequences (B)
        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
        :return: attention weights (B, Lmax, Tmax)
        :rtype: float ndarray
        """
        raise NotImplementedError(
            "calculate_all_attentions method is not implemented")
    def calculate_all_ctc_probs(self, xs, ilens, ys):
        """Calculate CTC probability.
        :param list xs_pad: list of padded input sequences [(T1, idim), (T2, idim), ...]
        :param ndarray ilens: batch of lengths of input sequences (B)
        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
        :return: CTC probabilities (B, Tmax, vocab)
        :rtype: float ndarray
        """
        raise NotImplementedError(
            "calculate_all_ctc_probs method is not implemented")
    @property
    def attention_plot_class(self):
        """Get attention plot class."""
        from espnet.asr.asr_utils import PlotAttentionReport
        return PlotAttentionReport
    @property
    def ctc_plot_class(self):
        """Get CTC plot class."""
        from espnet.asr.asr_utils import PlotCTCReport
        return PlotCTCReport
    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        raise NotImplementedError(
            "get_total_subsampling_factor method is not implemented")
    def encode(self, feat):
        """Encode feature in `beam_search` (optional).
        Args:
            x (numpy.ndarray): input feature (T, D)
        Returns:
            paddle.Tensor: encoded feature (T, D)
        """
        raise NotImplementedError("encode method is not implemented")
    def scorers(self):
        """Get scorers for `beam_search` (optional).
        Returns:
            dict[str, ScorerInterface]: dict of `ScorerInterface` objects
        """
        raise NotImplementedError("decoders method is not implemented")
 predefined_asr = {
    "transformer": "deepspeech.models.u2:U2Model",
    "conformer": "deepspeech.models.u2:U2Model",
 }
 def dynamic_import_asr(module):
    """Import ASR models dynamically.
    Args:
        module (str): asr name. e.g., transformer, conformer
    Returns:
        type: ASR class
    """
    model_class = dynamic_import(module, predefined_asr)
    assert issubclass(model_class,
                      ASRInterface), f"{module} does not implement ASRInterface"
    return model_class
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@ -28,8 +28,10 @@ from paddle import jit
 from paddle import nn
 from yacs.config import CfgNode
 from deepspeech.decoders.scorers.ctc import CTCPrefixScorer
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.frontend.utility import load_cmvn
 from deepspeech.models.asr_interface import ASRInterface
 from deepspeech.modules.cmvn import GlobalCMVN
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.modules.decoder import TransformerDecoder
@ -55,7 +57,7 @@ __all__ = ["U2Model", "U2InferModel"]
 logger = Log(__name__).getlog()
-class U2BaseModel(nn.Layer):
+class U2BaseModel(ASRInterface, nn.Layer):
    """CTC-Attention hybrid Encoder-Decoder model"""
    @classmethod
@ -120,7 +122,7 @@ class U2BaseModel(nn.Layer):
                 **kwargs):
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
-        super().__init__()
+        nn.Layer.__init__(self)
        # note that eos is the same as sos (equivalent ID)
        self.sos = vocab_size - 1
        self.eos = vocab_size - 1
@ -813,7 +815,27 @@ class U2BaseModel(nn.Layer):
        return res, res_tokenids
-class U2Model(U2BaseModel):
+class U2DecodeModel(U2BaseModel):
    def scorers(self):
        """Scorers."""
        return dict(
            decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
    def encode(self, x):
        """Encode acoustic features.
        :param ndarray x: source acoustic feature (T, D)
        :return: encoder outputs
        :rtype: paddle.Tensor
        """
        self.eval()
        x = paddle.to_tensor(x).unsqueeze(0)
        ilen = x.size(1)
        enc_output, _ = self._forward_encoder(x, ilen)
        return enc_output.squeeze(0)
 class U2Model(U2DecodeModel):
    def __init__(self, configs: dict):
        vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Union
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
@ -40,7 +42,7 @@ class CTCDecoderBase(nn.Layer):
                 dropout_rate: float=0.0,
                 reduction: bool=True,
                 batch_average: bool=True,
-                 grad_norm_type: str="instance"):
+                 grad_norm_type: Union[str, None]=None):
        """CTC decoder
        Args:
@ -49,7 +51,7 @@ class CTCDecoderBase(nn.Layer):
            dropout_rate (float): dropout rate (0.0 ~ 1.0)
            reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
            batch_average (bool): do batch dim wise average.
-            grad_norm_type (str): one of 'instance', 'batch', 'frame', None.
+            grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None. 
        """
        assert check_argument_types()
        super().__init__()
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Decoder definition."""
 from typing import Any
 from typing import List
 from typing import Optional
 from typing import Tuple
@ -20,10 +21,12 @@ import paddle
 from paddle import nn
 from typeguard import check_argument_types
 from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.modules.attention import MultiHeadedAttention
 from deepspeech.modules.decoder_layer import DecoderLayer
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.modules.mask import make_xs_mask
 from deepspeech.modules.mask import subsequent_mask
 from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
 from deepspeech.utils.log import Log
@ -33,7 +36,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]
-class TransformerDecoder(nn.Layer):
+class TransformerDecoder(BatchScorerInterface, nn.Layer):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
@ -71,7 +74,8 @@ class TransformerDecoder(nn.Layer):
            concat_after: bool=False, ):
        assert check_argument_types()
-        super().__init__()
+        nn.Layer.__init__(self)
        self.selfattention_layer_type = 'selfattn'
        attention_dim = encoder_output_size
        if input_layer == "embed":
@ -180,3 +184,63 @@ class TransformerDecoder(nn.Layer):
        if self.use_output_layer:
            y = paddle.log_softmax(self.output_layer(y), axis=-1)
        return y, new_cache
    # beam search API (see ScorerInterface)
    def score(self, ys, state, x):
        """Score.
        ys: (ylen,)
        x: (xlen, n_feat)
        """
        ys_mask = subsequent_mask(len(ys)).unsqueeze(0)  # (B,L,L)
        x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1)  # (B,1,T)
        if self.selfattention_layer_type != "selfattn":
            # TODO(karita): implement cache
            logging.warning(
                f"{self.selfattention_layer_type} does not support cached decoding."
            )
            state = None
        logp, state = self.forward_one_step(
            x.unsqueeze(0), x_mask, ys.unsqueeze(0), ys_mask, cache=state)
        return logp.squeeze(0), state
    # batch beam search API (see BatchScorerInterface)
    def batch_score(self,
                    ys: paddle.Tensor,
                    states: List[Any],
                    xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
        """Score new token batch (required).
        Args:
            ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen).
            states (List[Any]): Scorer states for prefix tokens.
            xs (paddle.Tensor):
                The encoder feature that generates ys (n_batch, xlen, n_feat).
        Returns:
            tuple[paddle.Tensor, List[Any]]: Tuple of
                batchfied scores for next token with shape of `(n_batch, n_vocab)`
                and next state list for ys.
        """
        # merge states
        n_batch = len(ys)
        n_layers = len(self.decoders)
        if states[0] is None:
            batch_state = None
        else:
            # transpose state of [batch, layer] into [layer, batch]
            batch_state = [
                paddle.stack([states[b][i] for b in range(n_batch)])
                for i in range(n_layers)
            ]
        # batch decoding
        ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0)  # (B,L,L)
        xs_mask = make_xs_mask(xs).unsqueeze(1)  # (B,1,T)
        logp, states = self.forward_one_step(
            xs, xs_mask, ys, ys_mask, cache=batch_state)
        # transpose state of [layer, batch] into [batch, layer]
        state_list = [[states[i][b] for i in range(n_layers)]
                      for b in range(n_batch)]
        return logp, state_list
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@ -54,7 +54,7 @@ class CTCLoss(nn.Layer):
            self.norm_by_total_logits_len = True
        else:
            raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}")
-        self.kwargs = {
+        kwargs = {
            "norm_by_times": self.norm_by_times,
            "norm_by_batchsize": self.norm_by_batchsize,
            "norm_by_total_logits_len": self.norm_by_total_logits_len,
@ -66,10 +66,9 @@ class CTCLoss(nn.Layer):
        except ValueError:
            # Some function, e.g. built-in function, are failed
            param = {}
-        self._kwargs = {k: v for k, v in self.kwargs.items() if k in param}
+        self._kwargs = {k: v for k, v in kwargs.items() if k in param}
-        _notin = {k: v for k, v in self.kwargs.items() if k not in param}
+        _notin = {k: v for k, v in kwargs.items() if k not in param}
        logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}")
        #self.loss_fn = partial(self.loss.forward, **_kwargs)
    def forward(self, logits, ys_pad, hlens, ys_lens):
        """Compute CTC loss.
@ -89,8 +88,7 @@ class CTCLoss(nn.Layer):
        # logits: (B, L, D) -> (L, B, D)
        logits = logits.transpose([1, 0, 2])
        ys_pad = ys_pad.astype(paddle.int32)
-        #loss = self.loss_fn(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(logits, ys_pad, hlens, ys_lens, **self._kwargs)
        loss = self.loss(logits, ys_pad, hlens, ys_lens)
        if self.batch_average:
            # Batch-size average
            loss = loss / B
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@ -18,12 +18,25 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 __all__ = [
-    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_xs_mask", "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
    "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
    "mask_finished_preds"
 ]
 def make_xs_mask(xs: paddle.Tensor, pad_value=0.0) -> paddle.Tensor:
    """Maks mask tensor containing indices of non-padded part.
    Args:
        xs (paddle.Tensor): (B, T, D), zeros for pad.
    Returns:
        paddle.Tensor: Mask Tensor indices of non-padded part. (B, T)
    """
    pad_frame = paddle.full([1, 1, xs.shape[-1]], pad_value, dtype=xs.dtype)
    mask = xs != pad_frame
    mask = mask.all(axis=-1)
    return mask
 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
    """Make mask tensor containing indices of padded part.
    See description of make_non_pad_mask.
@ -31,6 +44,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
        lengths (paddle.Tensor): Batch of lengths (B,).
    Returns:
        paddle.Tensor: Mask tensor containing indices of padded part.
        (B, T)
    Examples:
        >>> lengths = [5, 3, 2]
        >>> make_pad_mask(lengths)
@ -62,6 +76,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
        lengths (paddle.Tensor): Batch of lengths (B,).
    Returns:
        paddle.Tensor: mask tensor containing indices of padded part.
        (B, T)
    Examples:
        >>> lengths = [5, 3, 2]
        >>> make_non_pad_mask(lengths)
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@ -35,7 +35,7 @@ class LoadFromFile(argparse.Action):
            parser.parse_args(f.read().split(), namespace)
-def default_argument_parser():
+def default_argument_parser(parser=None):
    r"""A simple yet genral argument parser for experiments with parakeet.
    This is used in examples with parakeet. And it is intended to be used by
@ -62,7 +62,9 @@ def default_argument_parser():
    argparse.ArgumentParser
        the parser
    """
-    parser = argparse.ArgumentParser()
+    if parser is None:
        parser = argparse.ArgumentParser()
    parser.register('action', 'extend', ExtendAction)
    parser.add_argument(
        '--conf', type=open, action=LoadFromFile, help="config file.")
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@ -126,7 +126,8 @@ class Trainer():
            logger.info(f"Set seed {args.seed}")
        # profiler and benchmark options
-        if self.args.benchmark_batch_size:
+        if hasattr(self.args,
                   "benchmark_batch_size") and self.args.benchmark_batch_size:
            with UpdateConfig(self.config):
                self.config.collator.batch_size = self.args.benchmark_batch_size
                self.config.training.log_interval = 1
@ -326,12 +327,24 @@ class Trainer():
        finally:
            self.destory()
    def restore(self):
        """Resume from latest checkpoint at checkpoints in the output
        directory or load a specified checkpoint.
        If ``args.checkpoint_path`` is not None, load the checkpoint, else
        resume training.
        """
        assert self.args.checkpoint_path
        infos = self.checkpoint.load_latest_parameters(
            self.model, checkpoint_path=self.args.checkpoint_path)
        return infos
    def run_test(self):
        """Do Test/Decode"""
        try:
            with Timer("Test/Decode Done: {}"):
                with self.eval():
-                    self.resume_or_scratch()
+                    self.restore()
                    self.test()
        except KeyboardInterrupt:
            exit(-1)
@ -341,6 +354,7 @@ class Trainer():
        try:
            with Timer("Export Done: {}"):
                with self.eval():
                    self.restore()
                    self.export()
        except KeyboardInterrupt:
            exit(-1)
@ -350,7 +364,7 @@ class Trainer():
        try:
            with Timer("Align Done: {}"):
                with self.eval():
-                    self.resume_or_scratch()
+                    self.restore()
                    self.align()
        except KeyboardInterrupt:
            sys.exit(-1)
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@ -42,7 +42,7 @@ def all_version():
        "paddle_commit": paddle.version.commit,
        "soundfile": soundfile.__version__,
    }
-    logger.info(f"Deps Module Version:{pformat(vers.items())}")
+    logger.info(f"Deps Module Version:{pformat(list(vers.items()))}")
@contextmanager
--- a/docs/source/asr/deepspeech_architecture.md
+++ b/docs/source/asr/deepspeech_architecture.md
@ -14,10 +14,11 @@ In addition, the training process and the testing process are also introduced.
 The arcitecture of the model is shown in Fig.1.
 <p align="center">
-<img src="../images/ds2onlineModel.png" width=800>
+<img src="../../images/ds2onlineModel.png" width=800>
 <br/>Fig.1 The Arcitecture of deepspeech2 online model
 </p>
 ### Data Preparation
 #### Vocabulary
 For English data, the vocabulary dictionary is composed of 26 English characters with " ' ", space, \<blank\> and \<eos\>. The \<blank\> represents the blank label in CTC, the \<unk\> represents the unknown character and the \<eos\> represents the start and the end characters. For mandarin, the vocabulary dictionary is composed of chinese characters statisticed from the training set and three additional characters are added. The added characters are \<blank\>, \<unk\> and \<eos\>.  For both English and mandarin data, we set the default indexs that \<blank\>=0, \<unk\>=1 and \<eos\>= last index.
@ -130,7 +131,7 @@ By using the command above, the training process can be started. There are 5 sta
 Using the command below, you can test the deepspeech2 online model.
 ```
 bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
-```
+ ```
 The detail commands are:
 ```
 conf_path=conf/deepspeech2_online.yaml
@ -152,7 +153,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test export ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
 fi
- ```
+```
 After the training process, we use stage 3,4,5 for testing process. The stage 3 is for testing the model generated in the stage 2 and provided the CER index of the test set. The stage 4 is for transforming the model from dynamic graph to static graph by using "paddle.jit" library. The stage 5 is for testing the model in static graph.
@ -161,12 +162,13 @@ The deepspeech2 offline model is similarity to the deepspeech2 online model. The
 The arcitecture of the model is shown in Fig.2.
 <p align="center">
-<img src="../images/ds2offlineModel.png" width=800>
+<img src="../../images/ds2offlineModel.png" width=800>
 <br/>Fig.2 The Arcitecture of deepspeech2 offline model
 </p>
 For data preparation and decoder, the deepspeech2 offline model is same with the deepspeech2 online model.
 The code of encoder and decoder for deepspeech2 offline model is in:
@ -182,7 +184,7 @@ For training and testing, the "model_type" and the "conf_path" must be set.
 # Training offline
 cd examples/aishell/s0
 bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
-```
+ ```
 ```
 # Testing offline
 cd examples/aishell/s0
--- a/docs/source/tts/README.md
+++ b/docs/source/tts/README.md
@ -2,10 +2,11 @@
 Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.  
 <div align="center">
-  <img src="docs/images/logo.png" width=300 /> <br>
+  <img src="../../images/logo.png" width=300 /> <br>
 </div>
-## News  <img src="./docs/images/news_icon.png" width="40"/>
+
 ## News  <img src="../../images/news_icon.png" width="40"/>
 - Oct-12-2021, Refector examples code.
 - Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
 - Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@ -5,7 +5,8 @@
 ## Data
-| Data Subset | Duration in Seconds |
+| Data Subset         | Duration in Seconds   |
-| data/manifest.train |  1.23 ~ 14.53125 |
+| ------------------- | --------------------- |
-| data/manifest.dev  | 1.645 ~ 12.533 |  
+| data/manifest.train | 1.23 ~ 14.53125       |
-| data/manifest.test | 1.859125 ~ 14.6999375 |
+| data/manifest.dev   | 1.645 ~ 12.533        |
 | data/manifest.test  | 1.859125 ~ 14.6999375 |
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@ -41,7 +41,7 @@ model:
  use_gru: True 
  share_rnn_weights: False
  blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 training:
  n_epoch: 80
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@ -43,7 +43,7 @@ model:
  fc_layers_size_list: -1,
  use_gru: False 
  blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 training:
  n_epoch: 50
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@ -77,7 +77,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@ -72,7 +72,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/aishell/s1/local/test_hub.sh
+++ b/examples/aishell/s1/local/test_hub.sh
@ -0,0 +1,47 @@
 #!/bin/bash
 if [ $# != 3 ];then
    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
 ckpt_prefix=$2
 audio_file=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
 fi
 # download language model
 #bash local/download_lm_ch.sh
 #if [ $? -ne 0 ]; then
 #    exit 1
 #fi
 for type in  attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test_hub.py \
    --nproc ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decoding.decoding_method ${type} \
    --opts decoding.batch_size ${batch_size} \
    --audio_file ${audio_file}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
 done
 exit 0
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@ -13,6 +13,8 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 audio_file="data/tmp.wav"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
@ -48,3 +50,8 @@ fi
    # train lm and build TLG
    ./local/tlg.sh --corpus aishell --lmtype srilm
 fi
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@ -218,6 +218,8 @@ fastspeech2_nosil_aishell3_ckpt_0.4
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@ -2,7 +2,7 @@
 This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of  [Transfer Learning from Speaker Veriﬁcation to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows:
 1. Speaker Encoder: We  use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the  transcriptions are not needed, we use more datasets, refer to  [ge2e](../../other/ge2e).
 2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each  sentence in AISHELL-3. This embedding is a extra input of  Tacotron2 which will be concated with encoder outputs.
-3. Vocoder: We use WaveFlow as the neural Vocoder，参考实验 [waveflow](../../ljspeech/voc0).
+3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](../../ljspeech/voc0).
 ## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
--- a/examples/aishell3/vc0/local/preprocess.sh
+++ b/examples/aishell3/vc0/local/preprocess.sh
@ -35,9 +35,3 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --input=${preprocess_path}/normalized_wav \
        --output=${preprocess_path}/mel
 fi
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@ -223,6 +223,8 @@ speedyspeech_nosil_baker_ckpt_0.5
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
 ```bash
 source path.sh
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e.py \
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@ -202,6 +202,8 @@ fastspeech2_nosil_baker_ckpt_0.4
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e.py \
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@ -2,8 +2,7 @@
 * s0 is for deepspeech2 offline
 * s1 is for transformer/conformer/U2
-* s2 is for transformer/conformer/U2 w/ kaldi feat
+* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
 need install Kaldi
 ## Data
 | Data Subset | Duration in Seconds |
--- a/examples/librispeech/s0/README.md
+++ b/examples/librispeech/s0/README.md
@ -1,14 +1,6 @@
 # LibriSpeech
 ## Data
 | Data Subset | Duration in Seconds |
 | --- | --- |
 | data/manifest.train |  0.83s ~ 29.735s |
 | data/manifest.dev | 1.065 ~ 35.155s |  
 | data/manifest.test-clean | 1.285s ~ 34.955s |
 ## Deepspeech2
 | Model | Params | release |  Config | Test set | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- |  
 | DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | test-clean | 14.49190807 | 0.067283 |  
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@ -41,7 +41,7 @@ model:
  use_gru: False 
  share_rnn_weights: True
  blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 training:
  n_epoch: 50
--- a/examples/librispeech/s0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml
@ -43,7 +43,7 @@ model:
  fc_layers_size_list: 512, 256
  use_gru: False 
  blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 training:
  n_epoch: 50
--- a/examples/librispeech/s0/local/test_hub.sh
+++ b/examples/librispeech/s0/local/test_hub.sh
@ -0,0 +1,36 @@
 #!/bin/bash
 if [ $# != 4 ];then
    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
 ckpt_prefix=$2
 model_type=$3
 audio_file=$4
 # download language model
 bash local/download_lm_en.sh
 if [ $? -ne 0 ]; then
   exit 1
 fi
 python3 -u ${BIN_DIR}/test_hub.py \
 --nproc ${ngpu} \
 --config ${config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type} \
 --audio_file ${audio_file}
 if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
 fi
 exit 0
--- a/examples/librispeech/s0/run.sh
+++ b/examples/librispeech/s0/run.sh
@ -13,6 +13,7 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 audio_file="data/tmp.flac"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
@ -37,3 +38,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@ -77,7 +77,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@ -70,7 +70,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@ -73,7 +73,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@ -68,7 +68,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null 
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s1/local/test_hub.sh
+++ b/examples/librispeech/s1/local/test_hub.sh
@ -0,0 +1,54 @@
 #!/bin/bash
 if [ $# != 3 ];then
    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
 ckpt_prefix=$2
 audio_file=$3
 # bpemode (unigram or bpe)
 nbpe=5000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 bpemodel=${bpeprefix}.model
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
 fi
 # download language model
 #bash local/download_lm_ch.sh
 #if [ $? -ne 0 ]; then
 #    exit 1
 #fi
 for type in attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test_hub.py \
    --nproc ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decoding.decoding_method ${type} \
    --opts decoding.batch_size ${batch_size} \
    --audio_file ${audio_file}
    #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
 done
 exit 0
--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
@ -15,6 +15,8 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 audio_file="data/tmp.flac"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
@ -44,3 +46,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@ -1,9 +1,14 @@
 # LibriSpeech
-## Transformer
+| Model | Params | Config | Augmentation| Loss |
-| Model | Params | Config | Augmentation| Test Set | Decode Method | Loss | WER % |  
+| --- | --- | --- | --- |
-| --- | --- | --- | --- | --- | --- | --- | --- |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | 6.3197922706604 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.395054340362549 | 4.2 |  
+
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.395054340362549 | 5.0 |  
+
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.395054340362549 |  |  
+| Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescore | 6.395054340362549 |  |  
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | test-clean | attention | 2620 | 52576 | 96.4 | 2.5 | 1.1 | 0.4 | 4.0 | 34.7 |  
 | test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 |  
 | test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 |  
 | test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 |  
 | test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 |  
--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
@ -77,7 +77,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
@ -70,7 +70,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
@ -73,7 +73,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/librispeech/s2/conf/decode/decode.yaml
+++ b/examples/librispeech/s2/conf/decode/decode.yaml
@ -0,0 +1,7 @@
 batchsize: 0
 beam-size: 60
 ctc-weight: 0.0
 lm-weight: 0.0
 maxlenratio: 0.0
 minlenratio: 0.0
 penalty: 0.0
--- a/examples/librispeech/s2/conf/decode/decode_all.yaml
+++ b/examples/librispeech/s2/conf/decode/decode_all.yaml
@ -0,0 +1,7 @@
 batchsize: 0
 beam-size: 60
 ctc-weight: 0.4
 lm-weight: 0.6
 maxlenratio: 0.0
 minlenratio: 0.0
 penalty: 0.0
--- a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml
+++ b/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml
@ -0,0 +1,7 @@
 batchsize: 0
 beam-size: 60
 ctc-weight: 0.4
 lm-weight: 0.0
 maxlenratio: 0.0
 minlenratio: 0.0
 penalty: 0.0
--- a/examples/librispeech/s2/local/recog.sh
+++ b/examples/librispeech/s2/local/recog.sh
@ -0,0 +1,109 @@
 #!/bin/bash
 set -e
 expdir=exp
 datadir=data
 nj=32
 tag=
 # decode config
 decode_config=conf/decode/decode.yaml
 # lm params
 lang_model=rnnlm.model.best
 lmexpdir=exp/train_rnnlm_pytorch_lm_transformer_cosine_batchsize32_lr1e-4_layer16_unigram5000_ngpu4/
 lmtag='nolm'
 recog_set="test-clean test-other dev-clean dev-other"
 recog_set="test-clean"
 # bpemode (unigram or bpe)
 nbpe=5000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 bpemodel=${bpeprefix}.model
 # bin params
 config_path=conf/transformer.yaml
 dict=data/bpe_unigram_5000_units.txt
 ckpt_prefix=
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 if [ -z ${ckpt_prefix} ]; then
    echo "usage: $0 --ckpt_prefix ckpt_prefix"
    exit 1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 ckpt_dir=$(dirname `dirname ${ckpt_prefix}`)
 echo "ckpt dir: ${ckpt_dir}"
 ckpt_tag=$(basename ${ckpt_prefix})
 echo "ckpt tag: ${ckpt_tag}"
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
 fi
 echo "chunk mode: ${chunk_mode}"
 echo "decode conf: ${decode_config}"
 # download language model
 #bash local/download_lm_en.sh
 #if [ $? -ne 0 ]; then
 #    exit 1
 #fi
 pids=() # initialize pids
 for dmethd in join_ctc; do
 (
    echo "${dmethd} decoding"
    for rtask in ${recog_set}; do
    (
        echo "${rtask} dataset"
        decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag}_${tag}
        feat_recog_dir=${datadir}
        mkdir -p ${decode_dir}
        mkdir -p ${feat_recog_dir}
        # split data
        split_json.sh manifest.${rtask} ${nj}
        #### use CPU for decoding
        ngpu=0
        # set batchsize 0 to disable batch decoding
        ${decode_cmd} JOB=1:${nj} ${decode_dir}/log/decode.JOB.log \
            python3 -u ${BIN_DIR}/recog.py \
                --api v2 \
                --config ${decode_config} \
                --ngpu ${ngpu} \
                --batchsize 0 \
                --checkpoint_path ${ckpt_prefix} \
                --dict-path ${dict} \
                --recog-json ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} \
                --result-label ${decode_dir}/data.JOB.json \
                --model-conf ${config_path} \
                --model ${ckpt_prefix}.pdparams
                #--rnnlm ${lmexpdir}/${lang_model} \
        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict}
    ) &
    pids+=($!) # store background pids
    i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
    [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." || true
    done
 )
 done
 echo "Finished"
 exit 0
--- a/examples/librispeech/s2/local/test.sh
+++ b/examples/librispeech/s2/local/test.sh
@ -6,7 +6,7 @@ expdir=exp
 datadir=data
 nj=32
-lmtag=
+lmtag='nolm'
 recog_set="test-clean test-other dev-clean dev-other"
 recog_set="test-clean"
@ -17,23 +17,31 @@ bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 bpemodel=${bpeprefix}.model
-if [ $# != 3 ];then
+config_path=conf/transformer.yaml
-    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
+dict=data/bpe_unigram_5000_units.txt
-    exit -1
+ckpt_prefix=
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 if [ -z ${ckpt_prefix} ]; then
    echo "usage: $0 --ckpt_prefix ckpt_prefix"
    exit 1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
-config_path=$1
+ckpt_dir=$(dirname `dirname ${ckpt_prefix}`)
-dict=$2
+echo "ckpt dir: ${ckpt_dir}"
-ckpt_prefix=$3
+
 ckpt_tag=$(basename ${ckpt_prefix})
 echo "ckpt tag: ${ckpt_tag}"
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
 fi
-echo "chunk mode ${chunk_mode}"
+echo "chunk mode: ${chunk_mode}"
 # download language model
@ -46,13 +54,13 @@ pids=() # initialize pids
 for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
 (
-    echo "${dmethd} decoding"
+    echo "decode method: ${dmethd}"
    for rtask in ${recog_set}; do
    (
-        echo "${rtask} dataset"
+        echo "dataset: ${rtask}"
-        decode_dir=decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
+        decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag}
        feat_recog_dir=${datadir}
-        mkdir -p ${expdir}/${decode_dir}
+        mkdir -p ${decode_dir}
        mkdir -p ${feat_recog_dir}
        # split data
@ -63,7 +71,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
        # set batchsize 0 to disable batch decoding
        batch_size=1
-        ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
+        ${decode_cmd} JOB=1:${nj} ${decode_dir}/log/decode.JOB.log \
            python3 -u ${BIN_DIR}/test.py \
            --model-name u2_kaldi \
            --run-mode test \
@ -71,12 +79,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
            --dict-path ${dict} \
            --config ${config_path} \
            --checkpoint_path ${ckpt_prefix} \
-            --result-file ${expdir}/${decode_dir}/data.JOB.json \
+            --result-file ${decode_dir}/data.JOB.json \
            --opts decoding.decoding_method ${dmethd} \
            --opts decoding.batch_size ${batch_size} \
            --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
-        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${expdir}/${decode_dir} ${dict}
+        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict}
    ) &
    pids+=($!) # store background pids
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
@ -1,4 +1,5 @@
 #!/bin/bash
 set -e
 . ./path.sh || exit 1;
@ -7,8 +8,9 @@ set -e
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-dict_path=data/train_960_unigram5000_units.txt
+dict_path=data/bpe_unigram_5000_units.txt
 avg_num=10
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 avg_ckpt=avg_${avg_num}
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@ -188,6 +188,8 @@ transformer_tts_ljspeech_ckpt_0.4
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained transformer_tts  and waveflow models.
 ```bash
 source path.sh
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e.py \
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@ -202,6 +202,8 @@ fastspeech2_nosil_ljspeech_ckpt_0.5
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e_en.py \
--- a/examples/other/punctuation_restoration/README.md
+++ b/examples/other/punctuation_restoration/README.md
@ -1,3 +1,3 @@
 # Punctation Restoration
-Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask] to do this task.
+Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) to do this task.
--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
@ -69,7 +69,7 @@ model:
        asr_weight: 0.0
        ctc_weight: 0.0
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
@ -69,7 +69,7 @@ model:
        asr_weight: 0.5
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
@ -67,7 +67,7 @@ model:
    model_conf:
        ctc_weight: 0.5
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@ -42,7 +42,7 @@ model:
  use_gru: False 
  share_rnn_weights: True 
  blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 training:
  n_epoch: 10
--- a/examples/tiny/s0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/s0/conf/deepspeech2_online.yaml
@ -44,7 +44,7 @@ model:
  fc_layers_size_list: 512, 256
  use_gru: True 
  blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 training:
  n_epoch: 10
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@ -77,7 +77,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@ -70,7 +70,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@ -73,7 +73,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@ -67,7 +67,7 @@ model:
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@ -212,6 +212,8 @@ fastspeech2_nosil_vctk_ckpt_0.5
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
 source path.sh
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,4 @@
 ConfigArgParse
 coverage
 editdistance
 g2p_en
--- a/tools/Makefile
+++ b/tools/Makefile
@ -6,7 +6,7 @@ CC ?= gcc        # used for sph2pipe
 # CXX = clang++  # Uncomment these lines...
 # CC = clang     # ...to build with Clang.
-$(WGET) ?= wget --no-check-certificate
+WGET ?= wget --no-check-certificate
 .PHONY: all clean
@ -85,7 +85,7 @@ sctk-$(SCTK_GITHASH).tar.gz:
 	if [ -d '$(DOWNLOAD_DIR)' ]; then \
 	  cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' .; \
 	else \
-	  $($(WGET)) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \
+	  $(WGET) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \
 	    https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \
 	fi
--- a/utils/avg_model.py
+++ b/utils/avg_model.py
@ -25,8 +25,8 @@ def main(args):
    paddle.set_device('cpu')
    val_scores = []
-    beat_val_scores = []
+    beat_val_scores = None
-    selected_epochs = []
+    selected_epochs = None
    jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
    jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
@ -80,9 +80,10 @@ def main(args):
        data = json.dumps({
            "mode": 'val_best' if args.val_best else 'latest',
            "avg_ckpt": args.dst_model,
-            "ckpt": path_list,
+            "val_loss_mean": np.mean(beat_val_scores),
-            "epoch": selected_epochs.tolist(),
+            "ckpts": path_list,
-            "val_loss": beat_val_scores.tolist(),
+            "epochs": selected_epochs.tolist(),
            "val_losses": beat_val_scores.tolist(),
        })
        f.write(data + "\n")
`@ -1,3 +1,3 @@`
	`# Punctation Restoration`	`# Punctation Restoration`

	`Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask] to do this task.`	`Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) to do this task.`