From f2f305cd66eac760552102fd8e469f6792904690 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 22 Oct 2021 08:34:59 +0000
Subject: [PATCH] add recog interface

---
 deepspeech/decoders/recog.py       | 154 +++++++++++++++++++++++++++++
 deepspeech/decoders/utils.py       |  75 ++++++++++++++
 deepspeech/models/asr_interface.py | 148 +++++++++++++++++++++++++++
 deepspeech/models/u2/u2.py         |  28 +++++-
 deepspeech/modules/decoder.py      |  69 ++++++++++++-
 deepspeech/modules/mask.py         |  16 ++-
 6 files changed, 484 insertions(+), 6 deletions(-)
 create mode 100644 deepspeech/decoders/recog.py
 create mode 100644 deepspeech/models/asr_interface.py

diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py
new file mode 100644
index 00000000..399c5c54
--- /dev/null
+++ b/deepspeech/decoders/recog.py
@@ -0,0 +1,154 @@
+"""V2 backend for `asr_recog.py` using py:class:`deepspeech.decoders.beam_search.BeamSearch`."""
+
+import json
+import paddle
+
+# from espnet.asr.asr_utils import get_model_conf
+# from espnet.asr.asr_utils import torch_load
+# from espnet.asr.pytorch_backend.asr import load_trained_model
+# from espnet.nets.lm_interface import dynamic_import_lm
+
+# from espnet.nets.asr_interface import ASRInterface
+
+from .utils import add_results_to_json
+from .batch_beam_search import BatchBeamSearch
+from .beam_search import BeamSearch
+from .scorer_interface import BatchScorerInterface
+from .scorers.length_bonus import LengthBonus
+
+from deepspeech.io.reader import LoadInputsAndTargets
+from deepspeech.utils.log import Log
+logger = Log(__name__).getlog()
+
+
+def recog_v2(args):
+    """Decode with custom models that implement ScorerInterface.
+
+    Args:
+        args (namespace): The program arguments.
+        See py:func:`bin.asr_recog.get_parser` for details
+
+    """
+    logger.warning("experimental API for custom LMs is selected by --api v2")
+    if args.batchsize > 1:
+        raise NotImplementedError("multi-utt batch decoding is not implemented")
+    if args.streaming_mode is not None:
+        raise NotImplementedError("streaming mode is not implemented")
+    if args.word_rnnlm:
+        raise NotImplementedError("word LM is not implemented")
+
+    # set_deterministic(args)
+    model, train_args = load_trained_model(args.model)  # TODO: still needs a Paddle port
+    # assert isinstance(model, ASRInterface)
+    model.eval()
+    load_inputs_and_targets = LoadInputsAndTargets(
+        mode="asr",
+        load_output=False,
+        sort_in_input_length=False,
+        preprocess_conf=train_args.preprocess_conf
+        if args.preprocess_conf is None
+        else args.preprocess_conf,
+        preprocess_args={"train": False},
+    )
+
+    if args.rnnlm:
+        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)  # TODO: needs a Paddle port
+        # NOTE: for compatibility with models older than v0.5.0
+        lm_model_module = getattr(lm_args, "model_module", "default")
+        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
+        lm = lm_class(len(train_args.char_list), lm_args)
+        torch_load(args.rnnlm, lm)  # TODO: replace with paddle checkpoint loading
+        lm.eval()
+    else:
+        lm = None
+
+    if args.ngram_model:
+        from .scorers.ngram import NgramFullScorer
+        from .scorers.ngram import NgramPartScorer
+
+        if args.ngram_scorer == "full":
+            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
+        else:
+            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
+    else:
+        ngram = None
+
+    scorers = model.scorers()
+    scorers["lm"] = lm
+    scorers["ngram"] = ngram
+    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
+    weights = dict(
+        decoder=1.0 - args.ctc_weight,
+        ctc=args.ctc_weight,
+        lm=args.lm_weight,
+        ngram=args.ngram_weight,
+        length_bonus=args.penalty,
+    )
+    beam_search = BeamSearch(
+        beam_size=args.beam_size,
+        vocab_size=len(train_args.char_list),
+        weights=weights,
+        scorers=scorers,
+        sos=model.sos,
+        eos=model.eos,
+        token_list=train_args.char_list,
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
+    )
+    # TODO(karita): make all scorers batchified
+    if args.batchsize == 1:
+        non_batch = [
+            k
+            for k, v in beam_search.full_scorers.items()
+            if not isinstance(v, BatchScorerInterface)
+        ]
+        if len(non_batch) == 0:
+            beam_search.__class__ = BatchBeamSearch
+            logger.info("BatchBeamSearch implementation is selected.")
+        else:
+            logger.warning(
+                f"Non-batch scorers {non_batch} found; "
+                "falling back to the non-batch implementation."
+            )
+
+    if args.ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+    if args.ngpu == 1:
+        device = "gpu:0"
+    else:
+        device = "cpu"
+    dtype = getattr(paddle, args.dtype)
+    logger.info(f"Decoding device={device}, dtype={dtype}")
+    model.to(device=device, dtype=dtype)
+    model.eval()
+    beam_search.to(device=device, dtype=dtype)
+    beam_search.eval()
+
+    # read json data
+    with open(args.recog_json, "rb") as f:
+        js = json.load(f)
+    # list of utterance dicts to dict, keyed by 'utt'
+    js = {item['utt']: item for item in js}
+
+    new_js = {}
+    with paddle.no_grad():
+        for idx, name in enumerate(js.keys(), 1):
+            logger.info("(%d/%d) decoding %s", idx, len(js.keys()), name)
+            batch = [(name, js[name])]
+            feat = load_inputs_and_targets(batch)[0][0]
+            enc = model.encode(paddle.to_tensor(feat).to(device=device, dtype=dtype))
+            nbest_hyps = beam_search(
+                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
+            )
+            nbest_hyps = [
+                h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
+            ]
+            new_js[name] = add_results_to_json(
+                js[name], nbest_hyps, train_args.char_list
+            )
+
+    with open(args.result_label, "wb") as f:
+        f.write(
+            json.dumps(
+                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
+            ).encode("utf_8")
+        )
diff --git a/deepspeech/decoders/utils.py b/deepspeech/decoders/utils.py
index 92f65814..0281a78b 100644
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
@@ -47,3 +47,78 @@ def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
         return True
     else:
         return False
+
+
+# * ------------------ recognition related ------------------ *
+def parse_hypothesis(hyp, char_list):
+    """Parse hypothesis.
+
+    Args:
+        hyp (dict[str, Any]): Recognition hypothesis.
+        char_list (list[str]): List of characters.
+
+    Returns:
+        tuple(str, str, str, float): Text, token, token id, and score.
+
+    """
+    # remove sos and get results
+    tokenid_as_list = list(map(int, hyp["yseq"][1:]))
+    token_as_list = [char_list[idx] for idx in tokenid_as_list]
+    score = float(hyp["score"])
+
+    # convert to string
+    tokenid = " ".join([str(idx) for idx in tokenid_as_list])
+    token = " ".join(token_as_list)
+    text = "".join(token_as_list).replace("<space>", " ")
+
+    return text, token, tokenid, score
+
+
+def add_results_to_json(js, nbest_hyps, char_list):
+    """Add N-best results to json.
+
+    Args:
+        js (dict[str, Any]): Groundtruth utterance dict.
+        nbest_hyps (list[dict[str, Any]]):
+            List of hypotheses, best first.
+        char_list (list[str]): List of characters.
+
+    Returns:
+        dict[str, Any]: Utterance dict with N-best recognition results added.
+
+    """
+    # copy old json info
+    new_js = dict()
+    new_js["utt2spk"] = js["utt2spk"]
+    new_js["output"] = []
+
+    for n, hyp in enumerate(nbest_hyps, 1):
+        # parse hypothesis
+        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
+
+        # copy ground-truth
+        if len(js["output"]) > 0:
+            out_dic = dict(js["output"][0].items())
+        else:
+            # for no reference case (e.g., speech translation)
+            out_dic = {"name": ""}
+
+        # update name
+        out_dic["name"] += "[%d]" % n
+
+        # add recognition results
+        out_dic["rec_text"] = rec_text
+        out_dic["rec_token"] = rec_token
+        out_dic["rec_tokenid"] = rec_tokenid
+        out_dic["score"] = score
+
+        # add to list of N-best result dicts
+        new_js["output"].append(out_dic)
+
+        # show 1-best result
+        if n == 1:
+            if "text" in out_dic.keys():
+                logging.info("groundtruth: %s" % out_dic["text"])
+            logging.info("prediction : %s" % out_dic["rec_text"])
+
+    return new_js
\ No newline at end of file
diff --git a/deepspeech/models/asr_interface.py b/deepspeech/models/asr_interface.py
new file mode 100644
index 00000000..eb820fc0
--- /dev/null
+++ b/deepspeech/models/asr_interface.py
@@ -0,0 +1,148 @@
+"""ASR Interface module."""
+import argparse
+
+from deepspeech.utils.dynamic_import import dynamic_import
+
+
+class ASRInterface:
+    """ASR Interface for ESPnet model implementation."""
+
+    @staticmethod
+    def add_arguments(parser):
+        """Add arguments to parser."""
+        return parser
+
+    @classmethod
+    def build(cls, idim: int, odim: int, **kwargs):
+        """Initialize this class with python-level args.
+
+        Args:
+            idim (int): Input feature dimension.
+            odim (int): Output vocabulary size.
+
+        Returns:
+            ASRInterface: A new instance of ASRInterface.
+
+        """
+        args = argparse.Namespace(**kwargs)
+        return cls(idim, odim, args)
+
+    def forward(self, xs, ilens, ys, olens):
+        """Compute loss for training.
+
+        :param xs: batch of padded source sequences paddle.Tensor (B, Tmax, idim)
+        :param ilens: batch of lengths of source sequences (B), paddle.Tensor
+        :param ys: batch of padded target sequences paddle.Tensor (B, Lmax)
+        :param olens: batch of lengths of target sequences (B), paddle.Tensor
+        :return: loss value
+        :rtype: paddle.Tensor
+        """
+        raise NotImplementedError("forward method is not implemented")
+
+    def recognize(self, x, recog_args, char_list=None, rnnlm=None):
+        """Recognize x for evaluation.
+
+        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
+        :param namespace recog_args: argument namespace containing options
+        :param list char_list: list of characters
+        :param paddle.nn.Layer rnnlm: language model module
+        :return: N-best decoding results
+        :rtype: list
+        """
+        raise NotImplementedError("recognize method is not implemented")
+
+    def recognize_batch(self, x, recog_args, char_list=None, rnnlm=None):
+        """Beam search implementation for batch.
+
+        :param paddle.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
+        :param namespace recog_args: argument namespace containing options
+        :param list char_list: list of characters
+        :param paddle.nn.Layer rnnlm: language model module
+        :return: N-best decoding results
+        :rtype: list
+        """
+        raise NotImplementedError("Batch decoding is not supported yet.")
+
+    def calculate_all_attentions(self, xs, ilens, ys):
+        """Calculate attention.
+
+        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
+        :param ndarray ilens: batch of lengths of input sequences (B)
+        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
+        :return: attention weights (B, Lmax, Tmax)
+        :rtype: float ndarray
+        """
+        raise NotImplementedError("calculate_all_attentions method is not implemented")
+
+    def calculate_all_ctc_probs(self, xs, ilens, ys):
+        """Calculate CTC probability.
+
+        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
+        :param ndarray ilens: batch of lengths of input sequences (B)
+        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
+        :return: CTC probabilities (B, Tmax, vocab)
+        :rtype: float ndarray
+        """
+        raise NotImplementedError("calculate_all_ctc_probs method is not implemented")
+
+    @property
+    def attention_plot_class(self):
+        """Get attention plot class."""
+        from espnet.asr.asr_utils import PlotAttentionReport
+
+        return PlotAttentionReport
+
+    @property
+    def ctc_plot_class(self):
+        """Get CTC plot class."""
+        from espnet.asr.asr_utils import PlotCTCReport
+
+        return PlotCTCReport
+
+    def get_total_subsampling_factor(self):
+        """Get total subsampling factor."""
+        raise NotImplementedError(
+            "get_total_subsampling_factor method is not implemented"
+        )
+
+    def encode(self, feat):
+        """Encode feature in `beam_search` (optional).
+
+        Args:
+            feat (numpy.ndarray): input feature (T, D)
+        Returns:
+            paddle.Tensor: encoded feature (T, D)
+        """
+        raise NotImplementedError("encode method is not implemented")
+
+    def scorers(self):
+        """Get scorers for `beam_search` (optional).
+
+        Returns:
+            dict[str, ScorerInterface]: dict of `ScorerInterface` objects
+
+        """
+        raise NotImplementedError("scorers method is not implemented")
+
+
+predefined_asr = {
+    "transformer": "deepspeech.models.u2:E2E",
+    "conformer": "deepspeech.models.u2:E2E",
+}
+
+def dynamic_import_asr(module, name):
+    """Import ASR models dynamically.
+
+    Args:
+        module (str): module_name:class_name, used when `name` is not predefined
+        name (str): ASR model name, e.g., transformer, conformer
+
+    Returns:
+        type: ASR class
+
+    """
+    model_class = dynamic_import(predefined_asr.get(name, module))
+    assert issubclass(
+        model_class, ASRInterface
+    ), f"{module} does not implement ASRInterface"
+    return model_class
diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py
index fd63fa9c..8915cbd7 100644
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@@ -49,13 +49,15 @@ from deepspeech/utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
 from deepspeech.utils.utility import UpdateConfig
+from deepspeech.models.asr_interface import ASRInterface
+from deepspeech.decoders.scorers.ctc_prefix_score import CTCPrefixScorer
 
 __all__ = ["U2Model", "U2InferModel"]
 
 logger = Log(__name__).getlog()
 
 
-class U2BaseModel(nn.Layer):
+class U2BaseModel(ASRInterface, nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""
 
     @classmethod
@@ -120,7 +122,7 @@ class U2BaseModel(nn.Layer):
                  **kwargs):
         assert 0.0 <= ctc_weight <= 1.0, ctc_weight
 
-        super().__init__()
+        nn.Layer.__init__(self)
         # note that eos is the same as sos (equivalent ID)
         self.sos = vocab_size - 1
         self.eos = vocab_size - 1
@@ -813,7 +815,27 @@ class U2BaseModel(nn.Layer):
         return res, res_tokenids
 
 
-class U2Model(U2BaseModel):
+class U2DecodeModel(U2BaseModel):
+
+    def scorers(self):
+        """Scorers."""
+        return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
+
+    def encode(self, x):
+        """Encode acoustic features.
+
+        :param ndarray x: source acoustic feature (T, D)
+        :return: encoder outputs
+        :rtype: paddle.Tensor
+        """
+        self.eval()
+        x = paddle.to_tensor(x).unsqueeze(0)
+        ilen = x.size(1)
+        enc_output, _ = self._forward_encoder(x, ilen)
+        return enc_output.squeeze(0)
+
+
+class U2Model(U2DecodeModel):
     def __init__(self, configs: dict):
         vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py
index 1ae3ce37..154b7390 100644
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@@ -15,6 +15,7 @@
 from typing import List
 from typing import Optional
 from typing import Tuple
+from typing import Any
 
 import paddle
 from paddle import nn
@@ -25,7 +26,9 @@ from deepspeech.modules.decoder_layer import DecoderLayer
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.modules.mask import subsequent_mask
+from deepspeech.modules.mask import make_xs_mask
 from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
+from deepspeech.decoders.scorer_interface import BatchScorerInterface
 from deepspeech.utils.log import Log
 
 logger = Log(__name__).getlog()
@@ -33,7 +36,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]
 
 
-class TransformerDecoder(nn.Layer):
+class TransformerDecoder(BatchScorerInterface, nn.Layer):
     """Base class of Transfomer decoder module.
     Args:
         vocab_size: output dim
@@ -71,7 +74,8 @@ class TransformerDecoder(nn.Layer):
         concat_after: bool=False, ):
 
         assert check_argument_types()
-        super().__init__()
+        nn.Layer.__init__(self)
+        self.selfattention_layer_type = 'selfattn'
         attention_dim = encoder_output_size
 
         if input_layer == "embed":
@@ -180,3 +184,64 @@
         if self.use_output_layer:
             y = paddle.log_softmax(self.output_layer(y), axis=-1)
         return y, new_cache
+
+    # beam search API (see ScorerInterface)
+    def score(self, ys, state, x):
+        """Score a prefix hypothesis (ScorerInterface).
+        ys: prefix token ids (ylen,)
+        x: encoder feature (xlen, n_feat)
+        """
+        ys_mask = subsequent_mask(len(ys)).unsqueeze(0)
+        x_mask = make_xs_mask(x.unsqueeze(0))
+        if self.selfattention_layer_type != "selfattn":
+            # TODO(karita): implement cache
+            logger.warning(
+                f"{self.selfattention_layer_type} does not support cached decoding."
+            )
+            state = None
+        logp, state = self.forward_one_step(
+            x.unsqueeze(0), x_mask,
+            ys.unsqueeze(0), ys_mask,
+            cache=state
+        )
+        return logp.squeeze(0), state
+
+    # batch beam search API (see BatchScorerInterface)
+    def batch_score(
+        self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor
+    ) -> Tuple[paddle.Tensor, List[Any]]:
+        """Score new token batch (required).
+
+        Args:
+            ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen).
+            states (List[Any]): Scorer states for prefix tokens.
+            xs (paddle.Tensor):
+                The encoder feature that generates ys (n_batch, xlen, n_feat).
+
+        Returns:
+            tuple[paddle.Tensor, List[Any]]: Tuple of
+                batchified scores for the next token, shape `(n_batch, n_vocab)`,
+                and the next state list for ys.
+
+        """
+        # merge states
+        n_batch = len(ys)
+        n_layers = len(self.decoders)
+        if states[0] is None:
+            batch_state = None
+        else:
+            # transpose state of [batch, layer] into [layer, batch]
+            batch_state = [
+                paddle.stack([states[b][i] for b in range(n_batch)])
+                for i in range(n_layers)
+            ]
+
+        # batch decoding
+        ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0)
+
+        xs_mask = make_xs_mask(xs)
+        logp, states = self.forward_one_step(xs, xs_mask, ys, ys_mask, cache=batch_state)
+
+        # transpose state of [layer, batch] into [batch, layer]
+        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
+        return logp, state_list
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index 00f228a2..cffa10a7 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -18,12 +18,24 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 
 __all__ = [
-    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_xs_mask", "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
     "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
     "mask_finished_preds"
 ]
 
 
+def make_xs_mask(xs: paddle.Tensor) -> paddle.Tensor:
+    """Make mask tensor marking the non-padded part.
+    Args:
+        xs (paddle.Tensor): (B, T, D), all-zero frames are pad.
+    Returns:
+        paddle.Tensor: Mask tensor of the non-padded part, (B, T, D).
+    """
+    pad_frame = paddle.zeros([1, 1, xs.shape[-1]], dtype=xs.dtype)
+    mask = xs != pad_frame
+    return mask
+
+
 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.
     See description of make_non_pad_mask.
@@ -31,6 +43,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         lengths (paddle.Tensor): Batch of lengths (B,).
     Returns:
         paddle.Tensor: Mask tensor containing indices of padded part.
+        (B, T)
     Examples:
         >>> lengths = [5, 3, 2]
         >>> make_pad_mask(lengths)
@@ -62,6 +75,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         lengths (paddle.Tensor): Batch of lengths (B,).
     Returns:
         paddle.Tensor: mask tensor containing indices of padded part.
+        (B, T)
     Examples:
         >>> lengths = [5, 3, 2]
        >>> make_non_pad_mask(lengths)
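
A note on the ScorerInterface contract that recog_v2 wires together: each entry
in the `scorers` dict exposes `score(ys, state, x)`, returning per-token
log-probabilities plus an opaque state, and the beam search ranks candidates by
the weighted sum defined in `weights`. A minimal sketch of that contract, with
`ToyUniformScorer` and the combination loop as illustrative stand-ins rather
than the patch's BeamSearch implementation:

import numpy as np

class ToyUniformScorer:
    """Stand-in scorer: gives every token the same log-probability."""
    def score(self, ys, state, x):
        n_vocab = 5
        return np.full(n_vocab, -np.log(n_vocab)), state

scorers = {"lm": ToyUniformScorer(), "length_bonus": ToyUniformScorer()}
weights = {"lm": 0.3, "length_bonus": 0.1}

combined = np.zeros(5)
for name, scorer in scorers.items():
    logp, _ = scorer.score(ys=[0], state=None, x=None)
    combined += weights[name] * logp
print(combined)  # per-token scores used to extend each hypothesis

In the patch itself the same weighted sum combines the decoder, the CTC prefix
scorer from U2DecodeModel.scorers(), the optional LM/ngram scorers, and the
length bonus.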
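
The result file written at the end of recog_v2 nests one such dict per
utterance under a top-level "utts" key. For reference, the shape produced by
add_results_to_json for a single utterance with one hypothesis, every value
invented for illustration:

example_utt = {
    "utt2spk": "speaker1",           # copied from the ground-truth entry
    "output": [{
        "name": "target1[1]",        # ground-truth name plus N-best rank
        "text": "HELLO WORLD",       # ground truth, kept when present
        "rec_text": "HELLO WORLD",   # from parse_hypothesis
        "rec_token": "HE LLO WO RLD",
        "rec_tokenid": "12 34 56 78",
        "score": -1.23,
    }],
}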
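
TransformerDecoder.batch_score reconciles two cache layouts: the beam keeps one
decoder cache per hypothesis (indexed [batch][layer]), while forward_one_step
wants one batched cache per layer (indexed [layer][batch]). The index shuffle
in isolation, as a pure-Python sketch where strings stand in for the cache
tensors that the real code paddle.stack()s:

n_batch, n_layers = 3, 2
# the beam hands over one state per hypothesis: [batch][layer]
states = [[f"cache_b{b}_l{i}" for i in range(n_layers)] for b in range(n_batch)]

# merge: regroup as [layer][batch] so each layer sees one batched cache
batch_state = [[states[b][i] for b in range(n_batch)] for i in range(n_layers)]

# split: after forward_one_step, back to [batch][layer] for the beam
state_list = [[batch_state[i][b] for i in range(n_layers)] for b in range(n_batch)]
assert state_list == states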
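
make_xs_mask infers padding from the data itself: a position counts as padding
where it equals the all-zero pad frame, with the (1, 1, D) comparison
broadcasting over batch and time. A quick sanity check of that behavior,
assuming a working paddle install:

import paddle

# two real frames followed by two zero (padded) frames: (B=1, T=4, D=2)
xs = paddle.concat([paddle.ones([1, 2, 2]), paddle.zeros([1, 2, 2])], axis=1)
pad_frame = paddle.zeros([1, 1, xs.shape[-1]], dtype=xs.dtype)
mask = xs != pad_frame  # broadcasts to (B, T, D)
print(mask.astype('int32').numpy().squeeze())
# [[1 1]
#  [1 1]
#  [0 0]
#  [0 0]]

Because the comparison is elementwise, a genuine frame that happens to contain
zero-valued features ends up partially masked; the length-based make_pad_mask
and make_non_pad_mask above avoid that ambiguity whenever lengths are known.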