recog into decoders, format code

4 years ago · dfd80b3aa2
parent ee6446a3aa
commit dfd80b3aa2
25 changed files with 808 additions and 683 deletions
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@ -233,7 +233,8 @@ def is_broadcastable(shp1, shp2):
 def masked_fill(xs: paddle.Tensor,
                mask: paddle.Tensor,
                value: Union[float, int]):
-    assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape, mask.shape)
+    assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape,
+                                                            mask.shape)
    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
    mask = mask.broadcast_to(bshape)
    trues = paddle.ones_like(xs) * value
@ -312,18 +313,18 @@ if not hasattr(paddle.Tensor, 'type_as'):


 def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
-   assert len(args) == 1
-   if isinstance(args[0], str):  # dtype
-       return x.astype(args[0])
-   elif isinstance(args[0], paddle.Tensor):  # Tensor
-       return x.astype(args[0].dtype)
-   else:  # Device
-       return x
+    assert len(args) == 1
+    if isinstance(args[0], str):  # dtype
+        return x.astype(args[0])
+    elif isinstance(args[0], paddle.Tensor):  # Tensor
+        return x.astype(args[0].dtype)
+    else:  # Device
+        return x


 if not hasattr(paddle.Tensor, 'to'):
-   logger.debug("register user to to paddle.Tensor, remove this when fixed!")
-   setattr(paddle.Tensor, 'to', to)
+    logger.debug("register user to to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'to', to)


 def func_float(x: paddle.Tensor) -> paddle.Tensor:
@ -355,7 +356,6 @@ if not hasattr(paddle.Tensor, 'tolist'):
    setattr(paddle.Tensor, 'tolist', tolist)


-
 ########### hack paddle.nn.functional #############
 # hack loss
 def ctc_loss(logits,
@ -384,7 +384,6 @@ logger.debug(
 )
 F.ctc_loss = ctc_loss

-
 ########### hack paddle.nn #############
 from paddle.nn import Layer
 from typing import Optional
@ -394,6 +393,7 @@ from typing import Tuple
 from typing import Iterator
 from collections import OrderedDict, abc as container_abcs

+
 class LayerDict(paddle.nn.Layer):
    r"""Holds submodules in a dictionary.

@ -438,7 +438,7 @@ class LayerDict(paddle.nn.Layer):
                return x
    """

-    def __init__(self, modules: Optional[Mapping[str, Layer]] = None) -> None:
+    def __init__(self, modules: Optional[Mapping[str, Layer]]=None) -> None:
        super(LayerDict, self).__init__()
        if modules is not None:
            self.update(modules)
@ -505,10 +505,11 @@ class LayerDict(paddle.nn.Layer):
        """
        if not isinstance(modules, container_abcs.Iterable):
            raise TypeError("LayerDict.update should be called with an "
-                            "iterable of key/value pairs, but got " +
-                            type(modules).__name__)
+                            "iterable of key/value pairs, but got " + type(
+                                modules).__name__)

-        if isinstance(modules, (OrderedDict, LayerDict, container_abcs.Mapping)):
+        if isinstance(modules,
+                      (OrderedDict, LayerDict, container_abcs.Mapping)):
            for key, module in modules.items():
                self[key] = module
        else:
@ -520,14 +521,15 @@ class LayerDict(paddle.nn.Layer):
                                    type(m).__name__)
                if not len(m) == 2:
                    raise ValueError("LayerDict update sequence element "
-                                     "#" + str(j) + " has length " + str(len(m)) +
-                                     "; 2 is required")
+                                     "#" + str(j) + " has length " + str(
+                                         len(m)) + "; 2 is required")
                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
                # that's too cumbersome to type correctly with overloads, so we add an ignore here
                self[m[0]] = m[1]  # type: ignore[assignment]

    # remove forward altogether to fall back on Module's _forward_unimplemented

+
 if not hasattr(paddle.nn, 'LayerDict'):
    logger.debug(
        "register user LayerDict to paddle.nn, remove this when fixed!")
--- a/deepspeech/decoders/beam_search/__init__.py
+++ b/deepspeech/decoders/beam_search/__init__.py
@ -0,0 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .batch_beam_search import BatchBeamSearch
+from .beam_search import beam_search
+from .beam_search import BeamSearch
+from .beam_search import Hypothesis
--- a/deepspeech/decoders/beam_search/batch_beam_search.py
+++ b/deepspeech/decoders/beam_search/batch_beam_search.py
@ -0,0 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class BatchBeamSearch():
+    pass
--- a/deepspeech/decoders/beam_search/beam_search.py
+++ b/deepspeech/decoders/beam_search/beam_search.py
@ -1,5 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Beam search module."""
-
 from itertools import chain
 from typing import Any
 from typing import Dict
@ -10,18 +22,18 @@ from typing import Union

 import paddle

-from .utils import end_detect
-from .scorers.scorer_interface import PartialScorerInterface
-from .scorers.scorer_interface import ScorerInterface
-
+from ..scorers.scorer_interface import PartialScorerInterface
+from ..scorers.scorer_interface import ScorerInterface
+from ..utils import end_detect
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

+
 class Hypothesis(NamedTuple):
    """Hypothesis data type."""

-    yseq: paddle.Tensor # (T,)
+    yseq: paddle.Tensor  # (T,)
    score: Union[float, paddle.Tensor] = 0
    scores: Dict[str, Union[float, paddle.Tensor]] = dict()
    states: Dict[str, Any] = dict()
@ -31,25 +43,24 @@ class Hypothesis(NamedTuple):
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
-            scores={k: float(v) for k, v in self.scores.items()},
-        )._asdict()
+            scores={k: float(v)
+                    for k, v in self.scores.items()}, )._asdict()


 class BeamSearch(paddle.nn.Layer):
    """Beam search implementation."""

    def __init__(
-        self,
-        scorers: Dict[str, ScorerInterface],
-        weights: Dict[str, float],
-        beam_size: int,
-        vocab_size: int,
-        sos: int,
-        eos: int,
-        token_list: List[str] = None,
-        pre_beam_ratio: float = 1.5,
-        pre_beam_score_key: str = None,
-    ):
+            self,
+            scorers: Dict[str, ScorerInterface],
+            weights: Dict[str, float],
+            beam_size: int,
+            vocab_size: int,
+            sos: int,
+            eos: int,
+            token_list: List[str]=None,
+            pre_beam_ratio: float=1.5,
+            pre_beam_score_key: str=None, ):
        """Initialize beam search.

        Args:
@ -71,12 +82,12 @@ class BeamSearch(paddle.nn.Layer):
        super().__init__()
        # set scorers
        self.weights = weights
-        self.scorers = dict() # all = full + partial
-        self.full_scorers = dict() # full tokens
-        self.part_scorers = dict() # partial tokens
+        self.scorers = dict()  # all = full + partial
+        self.full_scorers = dict()  # full tokens
+        self.part_scorers = dict()  # partial tokens
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
-        self.nn_dict = paddle.nn.LayerDict() # nn.Layer
+        self.nn_dict = paddle.nn.LayerDict()  # nn.Layer
        for k, v in scorers.items():
            w = weights.get(k, 0)
            if w == 0 or v is None:
@ -100,20 +111,16 @@ class BeamSearch(paddle.nn.Layer):
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
-        if (
-            pre_beam_score_key is not None
-            and pre_beam_score_key != "full"
-            and pre_beam_score_key not in self.full_scorers
-        ):
-            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
+        if (pre_beam_score_key is not None and pre_beam_score_key != "full" and
+                pre_beam_score_key not in self.full_scorers):
+            raise KeyError(
+                f"{pre_beam_score_key} is not found in {self.full_scorers}")
        # selected `key` scorer to do pre beam search
        self.pre_beam_score_key = pre_beam_score_key
        # do_pre_beam when need, valid and has part_scorers
-        self.do_pre_beam = (
-            self.pre_beam_score_key is not None
-            and self.pre_beam_size < self.n_vocab
-            and len(self.part_scorers) > 0
-        )
+        self.do_pre_beam = (self.pre_beam_score_key is not None and
+                            self.pre_beam_size < self.n_vocab and
+                            len(self.part_scorers) > 0)

    def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.
@ -135,12 +142,12 @@ class BeamSearch(paddle.nn.Layer):
                yseq=paddle.to_tensor([self.sos], place=x.place),
                score=0.0,
                scores=init_scores,
-                states=init_states,
-            )
+                states=init_states, )
        ]

    @staticmethod
-    def append_token(xs: paddle.Tensor, x: Union[int, paddle.Tensor]) -> paddle.Tensor:
+    def append_token(xs: paddle.Tensor,
+                     x: Union[int, paddle.Tensor]) -> paddle.Tensor:
        """Append new token to prefix tokens.

        Args:
@ -154,9 +161,8 @@ class BeamSearch(paddle.nn.Layer):
        x = paddle.to_tensor([x], dtype=xs.dtype) if isinstance(x, int) else x
        return paddle.concat((xs, x))

-    def score_full(
-        self, hyp: Hypothesis, x: paddle.Tensor
-    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
+    def score_full(self, hyp: Hypothesis, x: paddle.Tensor
+                   ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
@ -178,9 +184,11 @@ class BeamSearch(paddle.nn.Layer):
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

-    def score_partial(
-        self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor
-    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
+    def score_partial(self,
+                      hyp: Hypothesis,
+                      ids: paddle.Tensor,
+                      x: paddle.Tensor
+                      ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
@ -201,12 +209,12 @@ class BeamSearch(paddle.nn.Layer):
        states = dict()
        for k, d in self.part_scorers.items():
            # scores[k] shape (len(ids),)
-            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
+            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k],
+                                                   x)
        return scores, states

-    def beam(
-        self, weighted_scores: paddle.Tensor, ids: paddle.Tensor
-    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    def beam(self, weighted_scores: paddle.Tensor,
+             ids: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute topk full token ids and partial token ids.

        Args:
@ -223,7 +231,8 @@ class BeamSearch(paddle.nn.Layer):
        """
        # no pre beam performed, `ids` equal to `weighted_scores`
        if weighted_scores.size(0) == ids.size(0):
-            top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab
+            top_ids = weighted_scores.topk(
+                self.beam_size)[1]  # index in n_vocab
            return top_ids, top_ids

        # mask pruned in pre-beam not to select in topk
@ -231,18 +240,18 @@ class BeamSearch(paddle.nn.Layer):
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        # top_ids no equal to local_ids, since ids shape not same
-        top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab
-        local_ids = weighted_scores[ids].topk(self.beam_size)[1] # index in len(ids)
+        top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
+        local_ids = weighted_scores[ids].topk(
+            self.beam_size)[1]  # index in len(ids)
        return top_ids, local_ids

    @staticmethod
    def merge_scores(
-        prev_scores: Dict[str, float],
-        next_full_scores: Dict[str, paddle.Tensor],
-        full_idx: int,
-        next_part_scores: Dict[str, paddle.Tensor],
-        part_idx: int,
-    ) -> Dict[str, paddle.Tensor]:
+            prev_scores: Dict[str, float],
+            next_full_scores: Dict[str, paddle.Tensor],
+            full_idx: int,
+            next_part_scores: Dict[str, paddle.Tensor],
+            part_idx: int, ) -> Dict[str, paddle.Tensor]:
        """Merge scores for new hypothesis.

        Args:
@ -288,9 +297,8 @@ class BeamSearch(paddle.nn.Layer):
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

-    def search(
-        self, running_hyps: List[Hypothesis], x: paddle.Tensor
-    ) -> List[Hypothesis]:
+    def search(self, running_hyps: List[Hypothesis],
+               x: paddle.Tensor) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.

        Args:
@ -311,11 +319,9 @@ class BeamSearch(paddle.nn.Layer):
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
-                pre_beam_scores = (
-                    weighted_scores
-                    if self.pre_beam_score_key == "full"
-                    else scores[self.pre_beam_score_key]
-                )
+                pre_beam_scores = (weighted_scores
+                                   if self.pre_beam_score_key == "full" else
+                                   scores[self.pre_beam_score_key])
                part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
@ -331,22 +337,21 @@ class BeamSearch(paddle.nn.Layer):
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
-                        scores=self.merge_scores(
-                            hyp.scores, scores, j, part_scores, part_j
-                        ),
+                        scores=self.merge_scores(hyp.scores, scores, j,
+                                                 part_scores, part_j),
                        states=self.merge_states(states, part_states, part_j),
-                    )
-                )
+                    ))

            # sort and prune 2 x beam -> beam
-            best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
-                : min(len(best_hyps), self.beam_size)
-            ]
+            best_hyps = sorted(
+                best_hyps, key=lambda x: x.score,
+                reverse=True)[:min(len(best_hyps), self.beam_size)]
        return best_hyps

-    def forward(
-        self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
-    ) -> List[Hypothesis]:
+    def forward(self,
+                x: paddle.Tensor,
+                maxlenratio: float=0.0,
+                minlenratio: float=0.0) -> List[Hypothesis]:
        """Perform beam search.

        Args:
@ -381,9 +386,11 @@ class BeamSearch(paddle.nn.Layer):
            logger.debug("position " + str(i))
            best = self.search(running_hyps, x)
            # post process of one iteration
-            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
+            running_hyps = self.post_process(i, maxlen, maxlenratio, best,
+                                             ended_hyps)
            # end detection
-            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
+            if maxlenratio == 0.0 and end_detect(
+                [h.asdict() for h in ended_hyps], i):
                logger.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
@ -395,15 +402,10 @@ class BeamSearch(paddle.nn.Layer):
        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching to eos
        if len(nbest_hyps) == 0:
-            logger.warning(
-                "there is no N-best results, perform recognition "
-                "again with smaller minlenratio."
-            )
-            return (
-                []
-                if minlenratio < 0.1
-                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
-            )
+            logger.warning("there is no N-best results, perform recognition "
+                           "again with smaller minlenratio.")
+            return ([] if minlenratio < 0.1 else
+                    self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1)))

        # report the best result
        best = nbest_hyps[0]
@ -412,7 +414,9 @@ class BeamSearch(paddle.nn.Layer):
                f"{float(v):6.2f} * {self.weights[k]:3} = {float(v) * self.weights[k]:6.2f} for {k}"
            )
        logger.info(f"total log probability: {float(best.score):.2f}")
-        logger.info(f"normalized log probability: {float(best.score) / len(best.yseq):.2f}")
+        logger.info(
+            f"normalized log probability: {float(best.score) / len(best.yseq):.2f}"
+        )
        logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            # logger.info(
@ -420,21 +424,17 @@ class BeamSearch(paddle.nn.Layer):
            #     + "".join([self.token_list[x] for x in best.yseq[1:-1]])
            #     + "\n"
            # )
-            logger.info(
-                "best hypo: "
-                + "".join([self.token_list[x] for x in best.yseq[1:]])
-                + "\n"
-            )
+            logger.info("best hypo: " + "".join(
+                [self.token_list[x] for x in best.yseq[1:]]) + "\n")
        return nbest_hyps

    def post_process(
-        self,
-        i: int,
-        maxlen: int,
-        maxlenratio: float,
-        running_hyps: List[Hypothesis],
-        ended_hyps: List[Hypothesis],
-    ) -> List[Hypothesis]:
+            self,
+            i: int,
+            maxlen: int,
+            maxlenratio: float,
+            running_hyps: List[Hypothesis],
+            ended_hyps: List[Hypothesis], ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.

        Args:
@ -450,10 +450,8 @@ class BeamSearch(paddle.nn.Layer):
        """
        logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
-            logger.debug(
-                "best hypo: "
-                + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
-            )
+            logger.debug("best hypo: " + "".join(
+                [self.token_list[x] for x in running_hyps[0].yseq[1:]]))
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logger.info("adding <eos> in the last position in the loop")
@ -468,7 +466,8 @@ class BeamSearch(paddle.nn.Layer):
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., Word LM needs to add final <eos> score
-                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
+                for k, d in chain(self.full_scorers.items(),
+                                  self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
@ -479,19 +478,18 @@ class BeamSearch(paddle.nn.Layer):


 def beam_search(
-    x: paddle.Tensor,
-    sos: int,
-    eos: int,
-    beam_size: int,
-    vocab_size: int,
-    scorers: Dict[str, ScorerInterface],
-    weights: Dict[str, float],
-    token_list: List[str] = None,
-    maxlenratio: float = 0.0,
-    minlenratio: float = 0.0,
-    pre_beam_ratio: float = 1.5,
-    pre_beam_score_key: str = "full",
-) -> list:
+        x: paddle.Tensor,
+        sos: int,
+        eos: int,
+        beam_size: int,
+        vocab_size: int,
+        scorers: Dict[str, ScorerInterface],
+        weights: Dict[str, float],
+        token_list: List[str]=None,
+        maxlenratio: float=0.0,
+        minlenratio: float=0.0,
+        pre_beam_ratio: float=1.5,
+        pre_beam_score_key: str="full", ) -> list:
    """Perform beam search with scorers.

    Args:
@ -527,6 +525,6 @@ def beam_search(
        pre_beam_score_key=pre_beam_score_key,
        sos=sos,
        eos=eos,
-        token_list=token_list,
-    ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
+        token_list=token_list, ).forward(
+            x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
    return [h.asdict() for h in ret]
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@ -1,37 +1,57 @@
-"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`."""
-
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
 import json
+from pathlib import Path
+
+import jsonlines
 import paddle
 import yaml
 from yacs.config import CfgNode
-from pathlib import Path
-import jsonlines

-# from espnet.asr.asr_utils import get_model_conf
-# from espnet.asr.asr_utils import torch_load
-# from espnet.asr.pytorch_backend.asr import load_trained_model
-# from espnet.nets.lm_interface import dynamic_import_lm
-
-from deepspeech.models.asr_interface import ASRInterface
-
-from .utils import add_results_to_json
-# from .batch_beam_search import BatchBeamSearch
+from .beam_search import BatchBeamSearch
 from .beam_search import BeamSearch
-from .scorers.scorer_interface import BatchScorerInterface
 from .scorers.length_bonus import LengthBonus
-
+from .scorers.scorer_interface import BatchScorerInterface
+from .utils import add_results_to_json
+from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
+from deepspeech.models.asr_interface import ASRInterface
 from deepspeech.utils.log import Log
+# from espnet.asr.asr_utils import get_model_conf
+# from espnet.asr.asr_utils import torch_load
+# from espnet.nets.lm_interface import dynamic_import_lm
+
 logger = Log(__name__).getlog()

+# NOTE: you need this func to generate our sphinx doc

-from deepspeech.utils.dynamic_import import dynamic_import
-from deepspeech.utils.utility import print_arguments

-model_test_alias = {
-    "u2": "deepspeech.exps.u2.model:U2Tester",
-    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
-}
+def load_trained_model(args):
+    args.nprocs = args.ngpu
+    confs = CfgNode()
+    confs.set_new_allowed(True)
+    confs.merge_from_file(args.model_conf)
+    class_obj = dynamic_import_tester(args.model_name)
+    exp = class_obj(confs, args)
+    with exp.eval():
+        exp.setup()
+        exp.restore()
+    char_list = exp.args.char_list
+    model = exp.model
+    return model, char_list, exp, confs
+

 def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.
@ -48,33 +68,17 @@ def recog_v2(args):
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")
-    args.nprocs = args.ngpu
-    # set_deterministic(args)
-
-    #model, train_args = load_trained_model(args.model)
-    model_path = Path(args.model)
-    ckpt_dir = model_path.parent.parent
-
-    confs = CfgNode()
-    confs.set_new_allowed(True)
-    confs.merge_from_file(args.model_conf)
-
-    class_obj = dynamic_import(args.model_name, model_test_alias)
-    exp = class_obj(confs, args)
-    with exp.eval():
-        exp.setup()
-        exp.restore()
-    char_list = exp.args.char_list

-    model = exp.model
+    # set_deterministic(args)
+    model, char_list, exp, confs = load_trained_model(args)
    assert isinstance(model, ASRInterface)
+
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=confs.collator.augmentation_config
-        if args.preprocess_conf is None
-        else args.preprocess_conf,
+        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={"train": False},
    )

@ -100,7 +104,7 @@ def recog_v2(args):
    else:
        ngram = None

-    scorers = model.scorers()
+    scorers = model.scorers()  # decoder
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(char_list))
@ -125,18 +129,15 @@ def recog_v2(args):
    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
-            k
-            for k, v in beam_search.full_scorers.items()
+            k for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            beam_search.__class__ = BatchBeamSearch
            logger.info("BatchBeamSearch implementation is selected.")
        else:
-            logger.warning(
-                f"As non-batch scorers {non_batch} are found, "
-                f"fall back to non-batch implementation."
-            )
+            logger.warning(f"As non-batch scorers {non_batch} are found, "
+                           f"fall back to non-batch implementation.")

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
@ -157,7 +158,7 @@ def recog_v2(args):
    with jsonlines.open(args.recog_json, "r") as reader:
        for item in reader:
            js.append(item)
-    # josnlines to dict, key by 'utt'
+    # jsonlines to dict, key by 'utt', value by jsonline
    js = {item['utt']: item for item in js}

    new_js = {}
@ -169,25 +170,26 @@ def recog_v2(args):
                feat = load_inputs_and_targets(batch)[0][0]
                logger.info(f'feat: {feat.shape}')
                enc = model.encode(paddle.to_tensor(feat).to(dtype))
-                logger.info(f'eouts: {enc.shape}')
-                nbest_hyps = beam_search(
-                    x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
-                )
+                logger.info(f'eout: {enc.shape}')
+                nbest_hyps = beam_search(x=enc,
+                                         maxlenratio=args.maxlenratio,
+                                         minlenratio=args.minlenratio)
                nbest_hyps = [
-                    h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
+                    h.asdict()
+                    for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
                ]
-                new_js[name] = add_results_to_json(
-                    js[name], nbest_hyps, char_list
-                )
+                new_js[name] = add_results_to_json(js[name], nbest_hyps,
+                                                   char_list)

-                item = new_js[name]['output'][0] # 1-best
-                utt = name 
+                item = new_js[name]['output'][0]  # 1-best
                ref = item['text']
-                rec_text = item['rec_text'].replace('▁', ' ').replace('<eos>', '').strip()
-                rec_tokenid = map(int, item['rec_tokenid'].split())
+                rec_text = item['rec_text'].replace('▁',
+                                                    ' ').replace('<eos>',
+                                                                 '').strip()
+                rec_tokenid = list(map(int, item['rec_tokenid'].split()))
                f.write({
-                        "utt": utt,
-                        "refs": [ref],
-                        "hyps": [rec_text],
-                        "hyps_tokenid": [rec_tokenid],
-                    })
+                    "utt": name,
+                    "refs": [ref],
+                    "hyps": [rec_text],
+                    "hyps_tokenid": [rec_tokenid],
+                })
--- a/deepspeech/decoders/recog_bin.py
+++ b/deepspeech/decoders/recog_bin.py
@ -0,0 +1,376 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""End-to-end speech recognition model decoding script."""
+import logging
+import os
+import random
+import sys
+from distutils.util import strtobool
+
+import configargparse
+import numpy as np
+
+from .recog import recog_v2
+
+
+def get_parser():
+    """Build the argument parser of the decoding script.
+
+    Options can be given on the command line or through the YAML files
+    passed with ``--config``, ``--config2`` and ``--config3``.
+
+    Returns:
+        configargparse.ArgumentParser: parser with the general, task, model,
+            search, transducer, RNNLM, ngram, streaming, Mask CTC and
+            quantization options registered.
+    """
+    parser = configargparse.ArgumentParser(
+        description="Transcribe text from speech using "
+        "a speech recognition model on one CPU or GPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter, )
+    parser.add(
+        '--model-name',
+        type=str,
+        default='u2_kaldi',
+        help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st')
+    # general configuration
+    parser.add("--config", is_config_file=True, help="Config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="Second config file path that overwrites the settings in `--config`",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="Third config file path that overwrites the settings "
+        "in `--config` and `--config2`", )
+
+    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
+    parser.add_argument(
+        "--dtype",
+        choices=("float16", "float32", "float64"),
+        default="float32",
+        help="Float precision (only available in --api v2)", )
+    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
+    parser.add_argument("--seed", type=int, default=1, help="Random seed")
+    parser.add_argument(
+        "--verbose", "-V", type=int, default=2, help="Verbose option")
+    parser.add_argument(
+        "--batchsize",
+        type=int,
+        default=1,
+        help="Batch size for beam search (0: means no batch processing)", )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing", )
+    parser.add_argument(
+        "--api",
+        default="v2",
+        choices=["v2"],
+        help="Beam search APIs "
+        "v2: Experimental API. It supports any models that implements ScorerInterface.",
+    )
+    # task related
+    parser.add_argument(
+        "--recog-json", type=str, help="Filename of recognition data (json)")
+    parser.add_argument(
+        "--result-label",
+        type=str,
+        required=True,
+        help="Filename of result label data (json)", )
+    # model (parameter) related
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Model file parameters to read")
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file")
+    parser.add_argument(
+        "--num-spkrs",
+        type=int,
+        default=1,
+        choices=[1, 2],
+        help="Number of speakers in the speech", )
+    parser.add_argument(
+        "--num-encs",
+        default=1,
+        type=int,
+        help="Number of encoders in the model.")
+    # search related
+    parser.add_argument(
+        "--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
+    parser.add_argument(
+        "--penalty", type=float, default=0.0, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths.
+                        If maxlenratio<0.0, its absolute value is interpreted
+                        as a constant max output length""", )
+    parser.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length", )
+    parser.add_argument(
+        "--ctc-weight",
+        type=float,
+        default=0.0,
+        help="CTC weight in joint decoding")
+    parser.add_argument(
+        "--weights-ctc-dec",
+        type=float,
+        action="append",
+        help="ctc weight assigned to each encoder during decoding."
+        "[in multi-encoder mode only]", )
+    parser.add_argument(
+        "--ctc-window-margin",
+        type=int,
+        default=0,
+        help="""Use CTC window with margin parameter to accelerate
+                        CTC/attention decoding especially on GPU. Smaller magin
+                        makes decoding faster, but may increase search errors.
+                        If margin=0 (default), this function is disabled""", )
+    # transducer related
+    parser.add_argument(
+        "--search-type",
+        type=str,
+        default="default",
+        choices=["default", "nsc", "tsd", "alsd", "maes"],
+        help="""Type of beam search implementation to use during inference.
+        Can be either: default beam search ("default"),
+        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
+        Alignment-Length Synchronous Decoding ("alsd") or
+        modified Adaptive Expansion Search ("maes").""", )
+    parser.add_argument(
+        "--nstep",
+        type=int,
+        default=1,
+        help="""Number of expansion steps allowed in NSC beam search or mAES
+        (nstep > 0 for NSC and nstep > 1 for mAES).""", )
+    parser.add_argument(
+        "--prefix-alpha",
+        type=int,
+        default=2,
+        help="Length prefix difference allowed in NSC beam search or mAES.", )
+    parser.add_argument(
+        "--max-sym-exp",
+        type=int,
+        default=2,
+        help="Number of symbol expansions allowed in TSD.", )
+    parser.add_argument(
+        "--u-max",
+        type=int,
+        default=400,
+        help="Length prefix difference allowed in ALSD.", )
+    parser.add_argument(
+        "--expansion-gamma",
+        type=float,
+        default=2.3,
+        help="Allowed logp difference for prune-by-value method in mAES.", )
+    parser.add_argument(
+        "--expansion-beta",
+        type=int,
+        default=2,
+        help="""Number of additional candidates for expanded hypotheses
+                selection in mAES.""", )
+    parser.add_argument(
+        "--score-norm",
+        type=strtobool,
+        nargs="?",
+        default=True,
+        help="Normalize final hypotheses' score by length", )
+    parser.add_argument(
+        "--softmax-temperature",
+        type=float,
+        default=1.0,
+        help="Penalization term for softmax function.", )
+    # rnnlm related
+    parser.add_argument(
+        "--rnnlm", type=str, default=None, help="RNNLM model file to read")
+    parser.add_argument(
+        "--rnnlm-conf",
+        type=str,
+        default=None,
+        help="RNNLM model config file to read")
+    parser.add_argument(
+        "--word-rnnlm",
+        type=str,
+        default=None,
+        help="Word RNNLM model file to read")
+    parser.add_argument(
+        "--word-rnnlm-conf",
+        type=str,
+        default=None,
+        help="Word RNNLM model config file to read", )
+    parser.add_argument(
+        "--word-dict", type=str, default=None, help="Word list to read")
+    parser.add_argument(
+        "--lm-weight", type=float, default=0.1, help="RNNLM weight")
+    # ngram related
+    parser.add_argument(
+        "--ngram-model",
+        type=str,
+        default=None,
+        help="ngram model file to read")
+    parser.add_argument(
+        "--ngram-weight", type=float, default=0.1, help="ngram weight")
+    parser.add_argument(
+        "--ngram-scorer",
+        type=str,
+        default="part",
+        choices=("full", "part"),
+        help="""if the ngram is set as a part scorer, similar with CTC scorer,
+                ngram scorer only scores topK hypethesis.
+                if the ngram is set as full scorer, ngram scorer scores all hypthesis
+                the decoding speed of part scorer is musch faster than full one""",
+    )
+    # streaming related
+    parser.add_argument(
+        "--streaming-mode",
+        type=str,
+        default=None,
+        choices=["window", "segment"],
+        help="""Use streaming recognizer for inference.
+                        `--batchsize` must be set to 0 to enable this mode""", )
+    parser.add_argument(
+        "--streaming-window", type=int, default=10, help="Window size")
+    parser.add_argument(
+        "--streaming-min-blank-dur",
+        type=int,
+        default=10,
+        help="Minimum blank duration threshold", )
+    parser.add_argument(
+        "--streaming-onset-margin", type=int, default=1, help="Onset margin")
+    parser.add_argument(
+        "--streaming-offset-margin", type=int, default=1, help="Offset margin")
+    # non-autoregressive related
+    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
+    parser.add_argument(
+        "--maskctc-n-iterations",
+        type=int,
+        default=10,
+        help="Number of decoding iterations."
+        "For Mask CTC, set 0 to predict 1 mask/iter.", )
+    parser.add_argument(
+        "--maskctc-probability-threshold",
+        type=float,
+        default=0.999,
+        help="Threshold probability for CTC output", )
+    # quantize model related
+    parser.add_argument(
+        "--quantize-config",
+        nargs="*",
+        help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]",
+    )
+    parser.add_argument(
+        "--quantize-dtype",
+        type=str,
+        default="qint8",
+        help="Dtype dynamic quantize")
+    # NOTE: `type=bool` would map every non-empty string (even "False") to
+    # True, so the flag values are parsed with strtobool like --score-norm.
+    parser.add_argument(
+        "--quantize-asr-model",
+        type=strtobool,
+        default=False,
+        help="Quantize asr model", )
+    parser.add_argument(
+        "--quantize-lm-model",
+        type=strtobool,
+        default=False,
+        help="Quantize lm model", )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function.
+
+    Parses the arguments, configures logging, seeds the random generators,
+    validates the options and dispatches to ``recog_v2``.
+
+    The process exits with status 1 when ``--ngpu`` does not match
+    CUDA_VISIBLE_DEVICES, when ``--ngpu`` is greater than 1, or when both
+    ``--rnnlm`` and ``--word-rnnlm`` are given.
+
+    Args:
+        args (list[str]): command-line arguments without the program name.
+
+    Raises:
+        ValueError: if float16 is requested with ``--ngpu 0``, if the API is
+            not v2, or if two speakers are requested.
+        NotImplementedError: if ``--num-encs`` is not 1 with the v2 API.
+    """
+    parser = get_parser()
+    parser.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    parser.add_argument("--dict-path", type=str, help="path to load dictionary")
+    args = parser.parse_args(args)
+
+    if args.ngpu == 0 and args.dtype == "float16":
+        raise ValueError(
+            f"--dtype {args.dtype} does not support the CPU backend.")
+
+    # logging info: --verbose 1 -> INFO, 2 -> DEBUG, anything else -> WARN
+    log_format = (
+        "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
+    if args.verbose == 1:
+        log_level = logging.INFO
+    elif args.verbose == 2:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.WARN
+    logging.basicConfig(level=log_level, format=log_format)
+    if log_level == logging.WARN:
+        logging.warning("Skip DEBUG/INFO messages")
+    logging.info(args)
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+        # TODO(mn5k): support of multiple GPUs
+        if args.ngpu > 1:
+            logging.error("The program only supports ngpu=1.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # validate rnn options
+    if args.rnnlm is not None and args.word_rnnlm is not None:
+        logging.error(
+            "It seems that both --rnnlm and --word-rnnlm are specified. "
+            "Please use either option.")
+        sys.exit(1)
+
+    # recog
+    if args.num_spkrs == 1:
+        if args.num_encs == 1:
+            # Experimental API that supports custom LMs; recog_v2 is the
+            # module-level import, so no function-local import is needed.
+            if args.api == "v2":
+                recog_v2(args)
+            else:
+                raise ValueError("Only support --api v2")
+        else:
+            if args.api == "v2":
+                raise NotImplementedError(
+                    f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
+                )
+    elif args.num_spkrs == 2:
+        raise ValueError("asr_mix not supported.")
+
+if __name__ == "__main__":
+    # Forward the command-line arguments, without the program name, to main().
+    main(sys.argv[1:])
--- a/deepspeech/decoders/scorers/ngram.py
+++ b/deepspeech/decoders/scorers/ngram.py
@ -85,8 +85,9 @@ class NgramFullScorer(Ngrambase, BatchScorerInterface):
                and next state list for ys.

        """
-        return self.score_partial_(
-            y, paddle.to_tensor(range(self.charlen)), state, x)
+        return self.score_partial_(y,
+                                   paddle.to_tensor(range(self.charlen)), state,
+                                   x)


 class NgramPartScorer(Ngrambase, PartialScorerInterface):
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import numpy as np
+
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

@ -98,7 +98,8 @@ def add_results_to_json(js, nbest_hyps, char_list):

    for n, hyp in enumerate(nbest_hyps, 1):
        # parse hypothesis
-        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
+        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp,
+                                                                   char_list)

        # copy ground-truth
        if len(js["output"]) > 0:
@ -125,4 +126,4 @@ def add_results_to_json(js, nbest_hyps, char_list):
                logger.info("groundtruth: %s" % out_dic["text"])
            logger.info("prediction : %s" % out_dic["rec_text"])

-    return new_js
+    return new_js
--- a/deepspeech/exps/init.py
+++ b/deepspeech/exps/init.py
@ -11,3 +11,52 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from deepspeech.training.trainer import Trainer
+from deepspeech.utils.dynamic_import import dynamic_import
+
+# Short trainer names mapped to "module.path:ClassName" import targets.
+model_trainer_alias = {
+    "ds2": "deepspeech.exps.deepspeech2.model:DeepSpeech2Trainer",
+    "u2": "deepspeech.exps.u2.model:U2Trainer",
+    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Trainer",
+    "u2_st": "deepspeech.exps.u2_st.model:U2STTrainer",
+}
+
+
+def dynamic_import_trainer(module):
+    """Resolve a trainer name to its class.
+
+    Args:
+        module (str): trainer name, e.g. ds2, u2, u2_kaldi; it is handed to
+            ``dynamic_import`` together with ``model_trainer_alias``.
+
+    Returns:
+        type: the imported class, asserted to be a subclass of ``Trainer``.
+
+    """
+    trainer_cls = dynamic_import(module, model_trainer_alias)
+    is_trainer = issubclass(trainer_cls, Trainer)
+    assert is_trainer, f"{module} does not implement Trainer"
+    return trainer_cls
+
+
+# Short tester names mapped to "module.path:ClassName" import targets.
+model_tester_alias = {
+    "ds2": "deepspeech.exps.deepspeech2.model:DeepSpeech2Tester",
+    "u2": "deepspeech.exps.u2.model:U2Tester",
+    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
+    "u2_st": "deepspeech.exps.u2_st.model:U2STTester",
+}
+
+
+def dynamic_import_tester(module):
+    """Resolve a tester name to its class.
+
+    Args:
+        module (str): tester name, e.g. ds2, u2, u2_kaldi; it is handed to
+            ``dynamic_import`` together with ``model_tester_alias``.
+
+    Returns:
+        type: the imported class, asserted to be a subclass of ``Trainer``.
+
+    """
+    tester_cls = dynamic_import(module, model_tester_alias)
+    # NOTE(review): the check is against Trainer although the message says
+    # Tester - presumably testers derive from Trainer; confirm.
+    is_trainer = issubclass(tester_cls, Trainer)
+    assert is_trainer, f"{module} does not implement Tester"
+    return tester_cls
--- a/deepspeech/exps/u2_kaldi/bin/recog.py
+++ b/deepspeech/exps/u2_kaldi/bin/recog.py
@ -1,379 +1,19 @@
-
-"""End-to-end speech recognition model decoding script."""
-
-import configargparse
-import logging
-import os
-import random
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import sys

-import numpy as np
-
-from distutils.util import strtobool
-from deepspeech.training.cli import default_argument_parser
-
-# NOTE: you need this func to generate our sphinx doc
-
-def get_parser():
-    """Get default arguments."""
-    parser = configargparse.ArgumentParser(
-        description="Transcribe text from speech using "
-        "a speech recognition model on one CPU or GPU",
-        config_file_parser_class=configargparse.YAMLConfigFileParser,
-        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add(
-        '--model-name',
-        type=str,
-        default='u2_kaldi',
-        help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st')
-    # general configuration
-    parser.add("--config", is_config_file=True, help="Config file path")
-    parser.add(
-        "--config2",
-        is_config_file=True,
-        help="Second config file path that overwrites the settings in `--config`",
-    )
-    parser.add(
-        "--config3",
-        is_config_file=True,
-        help="Third config file path that overwrites the settings "
-        "in `--config` and `--config2`",
-    )
-
-    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
-    parser.add_argument(
-        "--dtype",
-        choices=("float16", "float32", "float64"),
-        default="float32",
-        help="Float precision (only available in --api v2)",
-    )
-    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
-    parser.add_argument("--seed", type=int, default=1, help="Random seed")
-    parser.add_argument("--verbose", "-V", type=int, default=2, help="Verbose option")
-    parser.add_argument(
-        "--batchsize",
-        type=int,
-        default=1,
-        help="Batch size for beam search (0: means no batch processing)",
-    )
-    parser.add_argument(
-        "--preprocess-conf",
-        type=str,
-        default=None,
-        help="The configuration file for the pre-processing",
-    )
-    parser.add_argument(
-        "--api",
-        default="v2",
-        choices=["v2"],
-        help="Beam search APIs "
-        "v2: Experimental API. It supports any models that implements ScorerInterface.",
-    )
-    # task related
-    parser.add_argument(
-        "--recog-json", type=str, help="Filename of recognition data (json)"
-    )
-    parser.add_argument(
-        "--result-label",
-        type=str,
-        required=True,
-        help="Filename of result label data (json)",
-    )
-    # model (parameter) related
-    parser.add_argument(
-        "--model", type=str, required=True, help="Model file parameters to read"
-    )
-    parser.add_argument(
-        "--model-conf", type=str, default=None, help="Model config file"
-    )
-    parser.add_argument(
-        "--num-spkrs",
-        type=int,
-        default=1,
-        choices=[1, 2],
-        help="Number of speakers in the speech",
-    )
-    parser.add_argument(
-        "--num-encs", default=1, type=int, help="Number of encoders in the model."
-    )
-    # search related
-    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
-    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
-    parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
-    parser.add_argument(
-        "--maxlenratio",
-        type=float,
-        default=0.0,
-        help="""Input length ratio to obtain max output length.
-                        If maxlenratio=0.0 (default), it uses a end-detect function
-                        to automatically find maximum hypothesis lengths.
-                        If maxlenratio<0.0, its absolute value is interpreted
-                        as a constant max output length""",
-    )
-    parser.add_argument(
-        "--minlenratio",
-        type=float,
-        default=0.0,
-        help="Input length ratio to obtain min output length",
-    )
-    parser.add_argument(
-        "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
-    )
-    parser.add_argument(
-        "--weights-ctc-dec",
-        type=float,
-        action="append",
-        help="ctc weight assigned to each encoder during decoding."
-        "[in multi-encoder mode only]",
-    )
-    parser.add_argument(
-        "--ctc-window-margin",
-        type=int,
-        default=0,
-        help="""Use CTC window with margin parameter to accelerate
-                        CTC/attention decoding especially on GPU. Smaller magin
-                        makes decoding faster, but may increase search errors.
-                        If margin=0 (default), this function is disabled""",
-    )
-    # transducer related
-    parser.add_argument(
-        "--search-type",
-        type=str,
-        default="default",
-        choices=["default", "nsc", "tsd", "alsd", "maes"],
-        help="""Type of beam search implementation to use during inference.
-        Can be either: default beam search ("default"),
-        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
-        Alignment-Length Synchronous Decoding ("alsd") or
-        modified Adaptive Expansion Search ("maes").""",
-    )
-    parser.add_argument(
-        "--nstep",
-        type=int,
-        default=1,
-        help="""Number of expansion steps allowed in NSC beam search or mAES
-        (nstep > 0 for NSC and nstep > 1 for mAES).""",
-    )
-    parser.add_argument(
-        "--prefix-alpha",
-        type=int,
-        default=2,
-        help="Length prefix difference allowed in NSC beam search or mAES.",
-    )
-    parser.add_argument(
-        "--max-sym-exp",
-        type=int,
-        default=2,
-        help="Number of symbol expansions allowed in TSD.",
-    )
-    parser.add_argument(
-        "--u-max",
-        type=int,
-        default=400,
-        help="Length prefix difference allowed in ALSD.",
-    )
-    parser.add_argument(
-        "--expansion-gamma",
-        type=float,
-        default=2.3,
-        help="Allowed logp difference for prune-by-value method in mAES.",
-    )
-    parser.add_argument(
-        "--expansion-beta",
-        type=int,
-        default=2,
-        help="""Number of additional candidates for expanded hypotheses
-                selection in mAES.""",
-    )
-    parser.add_argument(
-        "--score-norm",
-        type=strtobool,
-        nargs="?",
-        default=True,
-        help="Normalize final hypotheses' score by length",
-    )
-    parser.add_argument(
-        "--softmax-temperature",
-        type=float,
-        default=1.0,
-        help="Penalization term for softmax function.",
-    )
-    # rnnlm related
-    parser.add_argument(
-        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
-    )
-    parser.add_argument(
-        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
-    )
-    parser.add_argument(
-        "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
-    )
-    parser.add_argument(
-        "--word-rnnlm-conf",
-        type=str,
-        default=None,
-        help="Word RNNLM model config file to read",
-    )
-    parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
-    parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
-    # ngram related
-    parser.add_argument(
-        "--ngram-model", type=str, default=None, help="ngram model file to read"
-    )
-    parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
-    parser.add_argument(
-        "--ngram-scorer",
-        type=str,
-        default="part",
-        choices=("full", "part"),
-        help="""if the ngram is set as a part scorer, similar with CTC scorer,
-                ngram scorer only scores topK hypethesis.
-                if the ngram is set as full scorer, ngram scorer scores all hypthesis
-                the decoding speed of part scorer is musch faster than full one""",
-    )
-    # streaming related
-    parser.add_argument(
-        "--streaming-mode",
-        type=str,
-        default=None,
-        choices=["window", "segment"],
-        help="""Use streaming recognizer for inference.
-                        `--batchsize` must be set to 0 to enable this mode""",
-    )
-    parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
-    parser.add_argument(
-        "--streaming-min-blank-dur",
-        type=int,
-        default=10,
-        help="Minimum blank duration threshold",
-    )
-    parser.add_argument(
-        "--streaming-onset-margin", type=int, default=1, help="Onset margin"
-    )
-    parser.add_argument(
-        "--streaming-offset-margin", type=int, default=1, help="Offset margin"
-    )
-    # non-autoregressive related
-    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
-    parser.add_argument(
-        "--maskctc-n-iterations",
-        type=int,
-        default=10,
-        help="Number of decoding iterations."
-        "For Mask CTC, set 0 to predict 1 mask/iter.",
-    )
-    parser.add_argument(
-        "--maskctc-probability-threshold",
-        type=float,
-        default=0.999,
-        help="Threshold probability for CTC output",
-    )
-    # quantize model related
-    parser.add_argument(
-        "--quantize-config",
-        nargs="*",
-        help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]",
-    )
-    parser.add_argument(
-        "--quantize-dtype", type=str, default="qint8", help="Dtype dynamic quantize"
-    )
-    parser.add_argument(
-        "--quantize-asr-model",
-        type=bool,
-        default=False,
-        help="Quantize asr model",
-    )
-    parser.add_argument(
-        "--quantize-lm-model",
-        type=bool,
-        default=False,
-        help="Quantize lm model",
-    )
-    return parser
-
-
-def main(args):
-    """Run the main decoding function."""
-    parser = get_parser()
-    parser.add_argument(
-        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
-    parser.add_argument(
-        "--checkpoint_path", type=str, help="path to load checkpoint")
-    parser.add_argument(
-        "--dict-path", type=str, help="path to load checkpoint")
-    # parser = default_argument_parser(parser)
-    args = parser.parse_args(args)
-
-    if args.ngpu == 0 and args.dtype == "float16":
-        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")
-
-    # logging info
-    if args.verbose == 1:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-        )
-    elif args.verbose == 2:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-        )
-        logging.warning("Skip DEBUG/INFO messages")
-    logging.info(args)
-
-    # check CUDA_VISIBLE_DEVICES
-    if args.ngpu > 0:
-        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
-        if cvd is None:
-            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
-        elif args.ngpu != len(cvd.split(",")):
-            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
-            sys.exit(1)
-
-        # TODO(mn5k): support of multiple GPUs
-        if args.ngpu > 1:
-            logging.error("The program only supports ngpu=1.")
-            sys.exit(1)
-
-    # display PYTHONPATH
-    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
-
-    # seed setting
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    logging.info("set random seed = %d" % args.seed)
-
-    # validate rnn options
-    if args.rnnlm is not None and args.word_rnnlm is not None:
-        logging.error(
-            "It seems that both --rnnlm and --word-rnnlm are specified. "
-            "Please use either option."
-        )
-        sys.exit(1)
-
-    # recog
-    if args.num_spkrs == 1:
-        if args.num_encs == 1:
-            # Experimental API that supports custom LMs
-            if args.api == "v2":
-                from deepspeech.decoders.recog import recog_v2
-                recog_v2(args)
-            else:
-                raise ValueError("Only support --api v2")
-        else:
-            if args.api == "v2":
-                raise NotImplementedError(
-                    f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
-                )
-    elif args.num_spkrs == 2:
-        raise ValueError("asr_mix not supported.")
-
+from deepspeech.decoders.recog_bin import main

 if __name__ == "__main__":
    main(sys.argv[1:])
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@ -434,8 +434,9 @@ class U2Tester(U2Trainer):
            simulate_streaming=cfg.simulate_streaming)
        decode_time = time.time() - start_time

-        for i, (utt, target, result, rec_tids) in enumerate(zip(
-                utts, target_transcripts, result_transcripts, result_tokenids)):
+        for i, (utt, target, result, rec_tids) in enumerate(
+                zip(utts, target_transcripts, result_transcripts,
+                    result_tokenids)):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@ -140,7 +140,7 @@ class TextFeaturizer():
        Returns:
           str: text string.
        """
-        tokens = [t.replace(SPACE, " ") for t in tokens ]
+        tokens = [t.replace(SPACE, " ") for t in tokens]
        return "".join(tokens)

    def word_tokenize(self, text):
--- a/deepspeech/models/asr_interface.py
+++ b/deepspeech/models/asr_interface.py
@ -1,3 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """ASR Interface module."""
 import argparse

@ -72,7 +85,8 @@ class ASRInterface:
        :return: attention weights (B, Lmax, Tmax)
        :rtype: float ndarray
        """
-        raise NotImplementedError("calculate_all_attentions method is not implemented")
+        raise NotImplementedError(
+            "calculate_all_attentions method is not implemented")

    def calculate_all_ctc_probs(self, xs, ilens, ys):
        """Calculate CTC probability.
@ -83,7 +97,8 @@ class ASRInterface:
        :return: CTC probabilities (B, Tmax, vocab)
        :rtype: float ndarray
        """
-        raise NotImplementedError("calculate_all_ctc_probs method is not implemented")
+        raise NotImplementedError(
+            "calculate_all_ctc_probs method is not implemented")

    @property
    def attention_plot_class(self):
@ -102,8 +117,7 @@ class ASRInterface:
    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        raise NotImplementedError(
-            "get_total_subsampling_factor method is not implemented"
-        )
+            "get_total_subsampling_factor method is not implemented")

    def encode(self, feat):
        """Encode feature in `beam_search` (optional).
@ -126,23 +140,22 @@ class ASRInterface:


 predefined_asr = {
-        "transformer": "deepspeech.models.u2:E2E",
-        "conformer": "deepspeech.models.u2:E2E",
+    "transformer": "deepspeech.models.u2:U2Model",
+    "conformer": "deepspeech.models.u2:U2Model",
 }

-def dynamic_import_asr(module, name):
+
+def dynamic_import_asr(module):
    """Import ASR models dynamically.

    Args:
-        module (str): module_name:class_name or alias in `predefined_asr`
-        name (str): asr name. e.g., transformer, conformer
+        module (str): asr name. e.g., transformer, conformer

    Returns:
        type: ASR class

    """
-    model_class = dynamic_import(module, predefined_asr.get(name, ""))
-    assert issubclass(
-        model_class, ASRInterface
-    ), f"{module} does not implement ASRInterface"
+    model_class = dynamic_import(module, predefined_asr)
+    assert issubclass(model_class,
+                      ASRInterface), f"{module} does not implement ASRInterface"
    return model_class
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@ -28,8 +28,10 @@ from paddle import jit
 from paddle import nn
 from yacs.config import CfgNode

+from deepspeech.decoders.scorers.ctc import CTCPrefixScorer
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.frontend.utility import load_cmvn
+from deepspeech.models.asr_interface import ASRInterface
 from deepspeech.modules.cmvn import GlobalCMVN
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.modules.decoder import TransformerDecoder
@ -49,8 +51,6 @@ from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
 from deepspeech.utils.utility import UpdateConfig
-from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.decoders.scorers.ctc import CTCPrefixScorer

 __all__ = ["U2Model", "U2InferModel"]

@ -816,10 +816,10 @@ class U2BaseModel(ASRInterface, nn.Layer):


 class U2DecodeModel(U2BaseModel):
-
    def scorers(self):
        """Scorers."""
-        return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
+        return dict(
+            decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))

    def encode(self, x):
        """Encode acoustic features.
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@ -12,23 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Decoder definition."""
+from typing import Any
 from typing import List
 from typing import Optional
 from typing import Tuple
-from typing import Any

 import paddle
 from paddle import nn
 from typeguard import check_argument_types

+from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.modules.attention import MultiHeadedAttention
 from deepspeech.modules.decoder_layer import DecoderLayer
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.mask import make_non_pad_mask
-from deepspeech.modules.mask import subsequent_mask
 from deepspeech.modules.mask import make_xs_mask
+from deepspeech.modules.mask import subsequent_mask
 from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
-from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@ -191,8 +191,8 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
        ys: (ylen,)
        x: (xlen, n_feat)
        """
-        ys_mask = subsequent_mask(len(ys)).unsqueeze(0) # (B,L,L)
-        x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1) # (B,1,T)
+        ys_mask = subsequent_mask(len(ys)).unsqueeze(0)  # (B,L,L)
+        x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1)  # (B,1,T)
        if self.selfattention_layer_type != "selfattn":
            # TODO(karita): implement cache
            logging.warning(
@ -200,16 +200,14 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
            )
            state = None
        logp, state = self.forward_one_step(
-            x.unsqueeze(0), x_mask, 
-            ys.unsqueeze(0), ys_mask,
-            cache=state
-        )
+            x.unsqueeze(0), x_mask, ys.unsqueeze(0), ys_mask, cache=state)
        return logp.squeeze(0), state

    # batch beam search API (see BatchScorerInterface)
-    def batch_score(
-        self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor
-    ) -> Tuple[paddle.Tensor, List[Any]]:
+    def batch_score(self,
+                    ys: paddle.Tensor,
+                    states: List[Any],
+                    xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
        """Score new token batch (required).

        Args:
@ -237,10 +235,12 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
            ]

        # batch decoding
-        ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L)
-        xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T)
-        logp, states = self.forward_one_step(xs, xs_mask, ys, ys_mask, cache=batch_state)
+        ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0)  # (B,L,L)
+        xs_mask = make_xs_mask(xs).unsqueeze(1)  # (B,1,T)
+        logp, states = self.forward_one_step(
+            xs, xs_mask, ys, ys_mask, cache=batch_state)

        # transpose state of [layer, batch] into [batch, layer]
-        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
+        state_list = [[states[i][b] for i in range(n_layers)]
+                      for b in range(n_batch)]
        return logp, state_list
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@ -24,7 +24,7 @@ __all__ = [
 ]


-def make_xs_mask(xs:paddle.Tensor, pad_value=0.0) -> paddle.Tensor:
+def make_xs_mask(xs: paddle.Tensor, pad_value=0.0) -> paddle.Tensor:
     """Make mask tensor containing indices of non-padded part.
    Args:
        xs (paddle.Tensor): (B, T, D), zeros for pad.
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@ -64,7 +64,7 @@ def default_argument_parser(parser=None):
    """
    if parser is None:
        parser = argparse.ArgumentParser()
-        
+
    parser.register('action', 'extend', ExtendAction)
    parser.add_argument(
        '--conf', type=open, action=LoadFromFile, help="config file.")
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@ -126,7 +126,8 @@ class Trainer():
            logger.info(f"Set seed {args.seed}")

        # profiler and benchmark options
-        if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size:
+        if hasattr(self.args,
+                   "benchmark_batch_size") and self.args.benchmark_batch_size:
            with UpdateConfig(self.config):
                self.config.collator.batch_size = self.args.benchmark_batch_size
                self.config.training.log_interval = 1
@ -335,8 +336,7 @@ class Trainer():
        """
        assert self.args.checkpoint_path
        infos = self.checkpoint.load_latest_parameters(
-            self.model,
-            checkpoint_path=self.args.checkpoint_path)
+            self.model, checkpoint_path=self.args.checkpoint_path)
        return infos

    def run_test(self):
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@ -1,8 +1,8 @@
 # ASR

-* s0 is for deepspeech2 offline 
-* s1 is for transformer/conformer/U2 
-* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi 
+* s0 is for deepspeech2 offline
+* s1 is for transformer/conformer/U2
+* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi

 ## Data
 | Data Subset | Duration in Seconds |
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@ -1,14 +1,14 @@
 # LibriSpeech

 | Model | Params | Config | Augmentation| Loss |
-| --- | --- | --- | --- | 
+| --- | --- | --- | --- | --- |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug | 6.3197922706604 |


-| Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err |   
+| Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err |  
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| test-clean | attention | 2620 | 52576 | 96.4 | 2.5 | 1.1 | 0.4 | 4.0 | 34.7 |   
-| test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 |   
-| test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 |   
-| test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 |   
-| test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 |   
+| test-clean | attention | 2620 | 52576 | 96.4 | 2.5 | 1.1 | 0.4 | 4.0 | 34.7 |  
+| test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 |  
+| test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 |  
+| test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 |  
+| test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 |  
--- a/examples/librispeech/s2/conf/decode/decode.yaml
+++ b/examples/librispeech/s2/conf/decode/decode.yaml
@ -1,6 +1,6 @@
 batchsize: 0
 beam-size: 60
-ctc-weight: 0.4
+ctc-weight: 0.0
 lm-weight: 0.0
 maxlenratio: 0.0
 minlenratio: 0.0
--- a/examples/librispeech/s2/local/recog.sh
+++ b/examples/librispeech/s2/local/recog.sh
@ -5,11 +5,14 @@ set -e
 expdir=exp
 datadir=data
 nj=32
+tag=

+# decode config
 decode_config=conf/decode/decode.yaml
+
+# lm params
 lang_model=rnnlm.model.best
 lmexpdir=exp/train_rnnlm_pytorch_lm_transformer_cosine_batchsize32_lr1e-4_layer16_unigram5000_ngpu4/
-
 lmtag='nolm'

 recog_set="test-clean test-other dev-clean dev-other"
@ -21,18 +24,21 @@ bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 bpemodel=${bpeprefix}.model

-if [ $# != 3 ];then
-    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
-    exit -1
+# bin params
+config_path=conf/transformer.yaml
+dict=data/bpe_unigram_5000_units.txt
+ckpt_prefix=
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+if [ -z ${ckpt_prefix} ]; then
+    echo "usage: $0 --ckpt_prefix ckpt_prefix"
+    exit 1
 fi

 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

-config_path=$1
-dict=$2
-ckpt_prefix=$3
-
 ckpt_dir=$(dirname `dirname ${ckpt_prefix}`)
 echo "ckpt dir: ${ckpt_dir}"

@ -61,7 +67,7 @@ for dmethd in join_ctc; do
    for rtask in ${recog_set}; do
    (
        echo "${rtask} dataset"
-        decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag}
+        decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag}_${tag}
        feat_recog_dir=${datadir}
        mkdir -p ${decode_dir}
        mkdir -p ${feat_recog_dir}
--- a/examples/librispeech/s2/local/test.sh
+++ b/examples/librispeech/s2/local/test.sh
@ -17,19 +17,20 @@ bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 bpemodel=${bpeprefix}.model

-if [ $# != 3 ];then
-    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
-    exit -1
+config_path=conf/transformer.yaml
+dict=data/bpe_unigram_5000_units.txt
+ckpt_prefix=
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+if [ -z ${ckpt_prefix} ]; then
+    echo "usage: $0 --ckpt_prefix ckpt_prefix"
+    exit 1
 fi

 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

-config_path=$1
-dict=$2
-ckpt_prefix=$3
-
-
 ckpt_dir=$(dirname `dirname ${ckpt_prefix}`)
 echo "ckpt dir: ${ckpt_dir}"

--- a/requirements.txt
+++ b/requirements.txt
@ -1,43 +1,43 @@
+ConfigArgParse
 coverage
 editdistance
+g2p_en
+g2pM
 gpustat
+h5py
+inflect
+jieba
 jsonlines
 kaldiio
+librosa
+llvmlite
 loguru
+matplotlib
+nltk
+numba
+numpy==1.20.0
+pandas
+phkit
 Pillow
+praatio~=4.1
 pre-commit
 pybind11
+pypinyin
+pyworld
 resampy==0.2.2
 sacrebleu
 scipy==1.2.1
 sentencepiece
 snakeviz
+soundfile~=0.10
 sox
 tensorboardX
 textgrid
+timer
 tqdm
 typeguard
-visualdl==2.2.0
-yacs
-numpy==1.20.0
-numba
-nltk
-inflect
-librosa
 unidecode
-llvmlite
-matplotlib
-pandas
-soundfile~=0.10
-g2p_en
-pypinyin
+visualdl==2.2.0
 webrtcvad
-g2pM
-praatio~=4.1
-h5py
-timer
-pyworld
-jieba
-phkit
+yacs
 yq
-ConfigArgParse
--- a/setup.py
+++ b/setup.py
@ -11,20 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
+import inspect
 import io
 import os
 import re
+import subprocess as sp
 import sys
 from pathlib import Path
-import contextlib
-import inspect

+from setuptools import Command
 from setuptools import find_packages
 from setuptools import setup
-from setuptools import Command
 from setuptools.command.develop import develop
 from setuptools.command.install import install
-import subprocess as sp

 HERE = Path(os.path.abspath(os.path.dirname(__file__)))

@ -40,16 +40,18 @@ def pushd(new_dir):


 def read(*names, **kwargs):
-    with io.open(os.path.join(os.path.dirname(__file__), *names),
-                 encoding=kwargs.get("encoding", "utf8")) as fp:
+    with io.open(
+            os.path.join(os.path.dirname(__file__), *names),
+            encoding=kwargs.get("encoding", "utf8")) as fp:
        return fp.read()


 def check_call(cmd: str, shell=False, executable=None):
    try:
-        sp.check_call(cmd.split(),
-                      shell=shell,
-                      executable="/bin/bash" if shell else executable)
+        sp.check_call(
+            cmd.split(),
+            shell=shell,
+            executable="/bin/bash" if shell else executable)
    except sp.CalledProcessError as e:
        print(
            f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:",
@ -189,7 +191,6 @@ setup_info = dict(
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
-    ],
-)
+    ], )

 setup(**setup_info)