test can run

5 years ago · e5347c48ef
parent abd2f5891b
commit e5347c48ef
15 changed files with 440 additions and 86 deletions
--- a/.notebook/python_test.ipynb
+++ b/.notebook/python_test.ipynb
@ -617,10 +617,52 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 58,
   "id": "pregnant-modem",
   "metadata": {},
   "outputs": [],
+   "source": [
+    "def get_default_args(fn):\n",
+    "    if fn is None:\n",
+    "        return {}\n",
+    "\n",
+    "    signature = inspect.signature(fn)\n",
+    "    return {\n",
+    "        k: v.default\n",
+    "        for k, v in signature.parameters.items()\n",
+    "        if v.default is not inspect.Parameter.empty\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "light-drill",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'inspect' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-59-8f782d06097f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_default_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-58-cb5a8794f39f>\u001b[0m in \u001b[0;36mget_default_args\u001b[0;34m(fn)\u001b[0m\n\u001b[1;32m      3\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m     \u001b[0msignature\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minspect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msignature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      6\u001b[0m     return {\n\u001b[1;32m      7\u001b[0m         \u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefault\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'inspect' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "get_default_args(io.open)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "protective-belgium",
+   "metadata": {},
+   "outputs": [],
   "source": []
  }
 ],
--- a/deepspeech/init.py
+++ b/deepspeech/init.py
@ -179,6 +179,11 @@ if not hasattr(paddle.Tensor, 'eq'):
    )
    paddle.Tensor.eq = eq

+if not hasattr(paddle, 'eq'):
+    logger.warn(
+        "override eq of paddle if exists or register, remove this when fixed!")
+    paddle.eq = eq
+

 def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
    return xs
@ -256,13 +261,14 @@ if not hasattr(paddle.Tensor, 'masked_fill'):

 def masked_fill_(xs: paddle.Tensor,
                 mask: paddle.Tensor,
-                 value: Union[float, int]):
+                 value: Union[float, int]) -> paddle.Tensor:
    assert is_broadcastable(xs.shape, mask.shape) is True
    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
    mask = mask.broadcast_to(bshape)
    trues = paddle.ones_like(xs) * value
    ret = paddle.where(mask, trues, xs)
    paddle.assign(ret.detach(), output=xs)
+    return xs


 if not hasattr(paddle.Tensor, 'masked_fill_'):
@ -271,9 +277,10 @@ if not hasattr(paddle.Tensor, 'masked_fill_'):
    paddle.Tensor.masked_fill_ = masked_fill_


-def fill_(xs: paddle.Tensor, value: Union[float, int]):
+def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
    val = paddle.full_like(xs, value)
    paddle.assign(val.detach(), output=xs)
+    return xs


 if not hasattr(paddle.Tensor, 'fill_'):
@ -317,7 +324,7 @@ if not hasattr(paddle.Tensor, 'type_as'):

 def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
    assert len(args) == 1
-    if isinstace(args[0], str):  # dtype
+    if isinstance(args[0], str):  # dtype
        return x.astype(args[0])
    elif isinstance(args[0], paddle.Tensor):  #Tensor
        return x.astype(args[0].dtype)
@ -338,6 +345,16 @@ if not hasattr(paddle.Tensor, 'float'):
    logger.warn("register user float to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'float', func_float)

+
+def tolist(x: paddle.Tensor) -> List[Any]:
+    return x.numpy().tolist()
+
+
+if not hasattr(paddle.Tensor, 'tolist'):
+    logger.warn(
+        "register user tolist to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'tolist', tolist)
+
 ########### hcak paddle.nn.functional #############


--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains U2 model."""

+import sys
 import time
 import logging
 import numpy as np
@ -256,11 +257,19 @@ class U2Tester(U2Trainer):
                cutoff_prob=1.0,  # Cutoff probability for pruning.
                cutoff_top_n=40,  # Cutoff number for pruning.
                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
+                # 'ctc_prefix_beam_search', 'attention_rescoring'
                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
                num_proc_bsearch=8,  # # of CPUs for beam search.
-                beam_size=500,  # Beam search width.
-                batch_size=128,  # decoding batch size
+                beam_size=10,  # Beam search width.
+                batch_size=16,  # decoding batch size
+                ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
+                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
+                # <0: for decoding, use full chunk.
+                # >0: for decoding, use fixed chunk size as set.
+                # 0: used for training, it's prohibited here. 
+                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
+                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
            ))

        if config is not None:
@ -279,19 +288,19 @@ class U2Tester(U2Trainer):
            trans.append(''.join([chr(i) for i in ids]))
        return trans

-    def compute_metrics(self, audio, texts, audio_len, texts_len):
+    def compute_metrics(self, audio, audio_len, texts, texts_len, fout=None):
        cfg = self.config.decoding
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer

-        vocab_list = self.test_loader.dataset.vocab_list
+        text_feature = self.test_loader.dataset.text_feature

        target_transcripts = self.ordid2token(texts, texts_len)
        result_transcripts = self.model.decode(
            audio,
            audio_len,
-            vocab_list,
+            text_feature=text_feature,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
@ -299,13 +308,19 @@ class U2Tester(U2Trainer):
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
-            num_processes=cfg.num_proc_bsearch)
+            num_processes=cfg.num_proc_bsearch,
+            ctc_weight=cfg.ctc_weight,
+            decoding_chunk_size=cfg.decoding_chunk_size,
+            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
+            simulate_streaming=cfg.simulate_streaming)

        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
+            if fout:
+                fout.write(result + "\n")
            self.logger.info(
                "\nTarget Transcription: %s\nOutput Transcription: %s" %
                (target, result))
@ -322,6 +337,7 @@ class U2Tester(U2Trainer):
    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
+        assert self.args.result_file
        self.model.eval()
        self.logger.info(
            f"Test Total Examples: {len(self.test_loader.dataset)}")
@ -329,13 +345,15 @@ class U2Tester(U2Trainer):
        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0

+        with open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
-            metrics = self.compute_metrics(*batch)
+                metrics = self.compute_metrics(*batch, fout=fout)
                errors_sum += metrics['errors_sum']
                len_refs += metrics['len_refs']
                num_ins += metrics['num_ins']
                error_rate_type = metrics['error_rate_type']
-            self.logger.info("Error rate [%s] (%d/?) = %f" %
+                self.logger.info(
+                    "Error rate [%s] (%d/?) = %f" %
                    (error_rate_type, num_ins, errors_sum / len_refs))

        # logging
@ -351,24 +369,34 @@ class U2Tester(U2Trainer):
        try:
            self.test()
        except KeyboardInterrupt:
-            exit(-1)
+            sys.exit(-1)

-    def export(self):
+    def load_inferspec(self):
+        """infer model and input spec.
+
+        Returns:
+            nn.Layer: inference model
+            List[paddle.static.InputSpec]: input spec.
+        """
        from deepspeech.models.u2 import U2InferModel
        infer_model = U2InferModel.from_pretrained(self.test_loader.dataset,
                                                   self.config.model.clone(),
                                                   self.args.checkpoint_path)
-        infer_model.eval()
        feat_dim = self.test_loader.dataset.feature_size
-        static_model = paddle.jit.to_static(
-            infer_model,
-            input_spec=[
+        input_spec = [
            paddle.static.InputSpec(
                shape=[None, feat_dim, None],
                dtype='float32'),  # audio, [B,D,T]
            paddle.static.InputSpec(shape=[None],
                                    dtype='int64'),  # audio_length, [B]
-            ])
+        ]
+        return infer_model, input_spec
+
+    def export(self):
+        infer_model, input_spec = self.load_inferspec()
+        assert isinstance(input_spec, list), type(input_spec)
+        infer_model.eval()
+        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)

@ -376,7 +404,7 @@ class U2Tester(U2Trainer):
        try:
            self.export()
        except KeyboardInterrupt:
-            exit(-1)
+            sys.exit(-1)

    def setup(self):
        """Setup the experiment.
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@ -107,8 +107,8 @@ class SpeechFeaturizer(object):
    def vocab_size(self):
        """Return the vocabulary size.

-        :return: Vocabulary size.
-        :rtype: int
+        Returns:
+            int: Vocabulary size.
        """
        return self._text_featurizer.vocab_size

@ -116,16 +116,34 @@ class SpeechFeaturizer(object):
    def vocab_list(self):
        """Return the vocabulary in list.

-        :return: Vocabulary in list.
-        :rtype: list
+        Returns:
+            List[str]: tokens.
        """
        return self._text_featurizer.vocab_list

+    @property
+    def vocab_dict(self):
+        """Return the vocabulary in dict.
+
+        Returns:
+            Dict[str, int]: token str -> int
+        """
+        return self._text_featurizer.vocab_dict
+
    @property
    def feature_size(self):
        """Return the audio feature size.

-        :return: audio feature size.
-        :rtype: int
+        Returns:
+            int: audio feature size.
        """
        return self._audio_featurizer.feature_size
+
+    @property
+    def text_feature(self):
+        """Return the text feature object.
+
+        Returns:
+            TextFeaturizer: object.
+        """
+        return self._text_featurizer
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@ -16,6 +16,7 @@
 import sentencepiece as spm

 from deepspeech.frontend.utility import UNK
+from deepspeech.frontend.utility import EOS


 class TextFeaturizer(object):
@ -32,10 +33,12 @@ class TextFeaturizer(object):
            spm_model_prefix (str, optional): spm model prefix. Defaults to None.
        """
        assert unit_type in ('char', 'spm', 'word')
-        self.unk = UNK
        self.unit_type = unit_type
-        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
+        self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)
+        self.unk = UNK
+        self.unk_id = self._vocab_list.index(self.unk)
+        self.eos_id = self._vocab_list.index(EOS)

        if unit_type == 'spm':
            spm_model = spm_model_prefix + '.model'
@ -51,14 +54,23 @@ class TextFeaturizer(object):
            tokens = self.spm_tokenize(text)
        return tokens

+    def detokenize(self, tokens):
+        if self.unit_type == 'char':
+            text = self.char_detokenize(tokens)
+        elif self.unit_type == 'word':
+            text = self.word_detokenize(tokens)
+        else:  # spm
+            text = self.spm_detokenize(tokens)
+        return text
+
    def featurize(self, text):
-        """Convert text string to a list of token indices in char-level.Note
-        that the token indexing order follows the given vocabulary file.
+        """Convert text string to a list of token indices.

-        :param text: Text to process.
-        :type text: str
-        :return: List of char-level token indices.
-        :rtype: List[int]
+        Args:
+            text (str): Text to process.
+        
+        Returns:
+            List[int]: List of token indices.
        """
        tokens = self.tokenize(text)
        ids = []
@ -67,6 +79,24 @@ class TextFeaturizer(object):
            ids.append(self._vocab_dict[token])
        return ids

+    def defeaturize(self, idxs):
+        """Convert a list of token indices to a text string,
+        ignoring the first eos_id and every index after it.
+
+        Args:
+            idxs (List[int]): List of token indices.
+
+        Returns:
+            str: Decoded text.
+        """
+        tokens = []
+        for idx in idxs:
+            if idx == self.eos_id:
+                break
+            tokens.append(self._id2token[idx])
+        text = self.detokenize(tokens)
+        return text
+
    @property
    def vocab_size(self):
        """Return the vocabulary size.
@ -80,19 +110,50 @@ class TextFeaturizer(object):
    def vocab_list(self):
        """Return the vocabulary in list.

-        :return: Vocabulary in list.
-        :rtype: list
+        Returns:
+            List[str]: tokens.
        """
        return self._vocab_list

+    @property
+    def vocab_dict(self):
+        """Return the vocabulary in dict.
+
+        Returns:
+            Dict[str, int]: token str -> int
+        """
+        return self._vocab_dict
+
    def char_tokenize(self, text):
-        """Character tokenizer."""
+        """Character tokenizer.
+
+        Args:
+            text (str): text string.
+
+        Returns:
+            List[str]: tokens.
+        """
        return list(text.strip())

+    def char_detokenize(self, tokens):
+        """Character detokenizer.
+
+        Args:
+            tokens (List[str]): tokens.
+
+        Returns:
+           str: text string.
+        """
+        return "".join(tokens)
+
    def word_tokenize(self, text):
-        """Word tokenizer, spearte by <space>."""
+        """Word tokenizer, separate by <space>."""
        return text.strip().split()

+    def word_detokenize(self, tokens):
+        """Word detokenizer, join tokens with <space>."""
+        return " ".join(tokens)
+
    def spm_tokenize(self, text):
        """spm tokenize.

@ -125,12 +186,34 @@ class TextFeaturizer(object):
        enc_line = encode_line(text)
        return enc_line

+    def spm_detokenize(self, tokens, input_format='piece'):
+        """spm detokenize.
+
+        Args:
+            tokens (List[str] | List[int]): pieces, or ids if input_format == "id".
+
+        Returns:
+            str: text
+        """
+        if input_format == "piece":
+
+            def decode(l):
+                return "".join(self.sp.DecodePieces(l))
+        elif input_format == "id":
+
+            def decode(l):
+                return "".join(self.sp.DecodeIds(l))
+
+        return decode(tokens)
+
    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
        with open(vocab_filepath, 'r', encoding='utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
-        vocab_dict = dict(
-            [(token, id) for (id, token) in enumerate(vocab_list)])
-        return vocab_dict, vocab_list
+        id2token = dict(
+            [(idx, token) for (idx, token) in enumerate(vocab_list)])
+        token2id = dict(
+            [(token, idx) for (idx, token) in enumerate(vocab_list)])
+        return token2id, id2token, vocab_list
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@ -233,22 +233,20 @@ class ManifestDataset(Dataset):

    @property
    def vocab_size(self):
-        """Return the vocabulary size.
-
-        :return: Vocabulary size.
-        :rtype: int
-        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
-        """Return the vocabulary in list.
-
-        :return: Vocabulary in list.
-        :rtype: list
-        """
        return self._speech_featurizer.vocab_list

+    @property
+    def vocab_dict(self):
+        return self._speech_featurizer.vocab_dict
+
+    @property
+    def text_feature(self):
+        return self._speech_featurizer.text_feature
+
    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@ -16,10 +16,11 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recogni
 (https://arxiv.org/pdf/2012.05481.pdf)
 """

+import sys
 from collections import defaultdict
 import logging
 from yacs.config import CfgNode
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Dict

 import paddle
 from paddle import jit
@ -132,6 +133,7 @@ class U2BaseModel(nn.Module):
            smoothing=lsm_weight,
            normalize_length=length_normalized_loss, )

+    @jit.export
    def forward(
            self,
            speech: paddle.Tensor,
@ -158,7 +160,7 @@ class U2BaseModel(nn.Module):
        encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
        #TODO(Hui Zhang): sum not support bool type
        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int64).sum(
+        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
            1)  #[B, 1, T] -> [B]

        # 2a. Attention-decoder branch
@ -301,14 +303,15 @@ class U2BaseModel(nn.Module):
        # log scale score
        scores = paddle.to_tensor(
            [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float)
-        scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to(
+        scores = scores.to(device).repeat(batch_size).unsqueeze(1).to(
            device)  # (B*N, 1)
        end_flag = paddle.zeros_like(scores, dtype=paddle.bool)  # (B*N, 1)
        cache: Optional[List[paddle.Tensor]] = None
        # 2. Decoder forward step by step
        for i in range(1, maxlen + 1):
            # Stop if all batch and all beam produce eos
-            if end_flag.sum() == running_size:
+            # TODO(Hui Zhang): sum not support bool type, was: if end_flag.sum() == running_size:
+            if end_flag.cast(paddle.int64).sum() == running_size:
                break

            # 2.1 Forward decoder step
@ -333,7 +336,7 @@ class U2BaseModel(nn.Module):
            # regard top_k_index as (B*N*N),regard offset_k_index as (B*N),
            # then find offset_k_index in top_k_index
            base_k_index = paddle.arange(batch_size).view(-1, 1).repeat(
-                [1, beam_size])  # (B, N)
+                1, beam_size)  # (B, N)
            base_k_index = base_k_index * beam_size * beam_size
            best_k_index = base_k_index.view(-1) + offset_k_index.view(
                -1)  # (B*N)
@ -678,6 +681,108 @@ class U2BaseModel(nn.Module):
        decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1)
        return decoder_out

+    @paddle.no_grad()
+    def decode(self,
+               feats: paddle.Tensor,
+               feats_lengths: paddle.Tensor,
+               text_feature: Dict[str, int],
+               decoding_method: str,
+               lang_model_path: str,
+               beam_alpha: float,
+               beam_beta: float,
+               beam_size: int,
+               cutoff_prob: float,
+               cutoff_top_n: int,
+               num_processes: int,
+               ctc_weight: float=0.0,
+               decoding_chunk_size: int=-1,
+               num_decoding_left_chunks: int=-1,
+               simulate_streaming: bool=False):
+        """u2 decoding.
+
+        Args:
+            feats (Tensor): audio features, (B, T, D)
+            feats_lengths (Tensor): (B)
+            text_feature (TextFeaturizer): text feature object.
+            decoding_method (str): decoding mode, e.g. 
+                    'attention', 'ctc_greedy_search', 
+                    'ctc_prefix_beam_search', 'attention_rescoring'
+            lang_model_path (str): lm path.
+            beam_alpha (float): lm weight.
+            beam_beta (float): length penalty.
+            beam_size (int): beam size for search
+            cutoff_prob (float): for prune.
+            cutoff_top_n (int): for prune.
+            num_processes (int): 
+            ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0.
+            decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1.
+                    <0: for decoding, use full chunk.
+                    >0: for decoding, use fixed chunk size as set.
+                    0: used for training, it's prohibited here. 
+            num_decoding_left_chunks (int, optional): 
+                    number of left chunks for decoding. Defaults to -1.
+            simulate_streaming (bool, optional): simulate streaming inference. Defaults to False.
+
+        Raises:
+            ValueError: if decoding_method is not supported.
+        
+        Returns:
+            List[str]: transcripts.
+        """
+        batch_size = feats.size(0)
+        if decoding_method in ['ctc_prefix_beam_search',
+                               'attention_rescoring'] and batch_size > 1:
+            logger.fatal(
+                f'decoding mode {decoding_method} must be running with batch_size == 1'
+            )
+            sys.exit(1)
+
+        if decoding_method == 'attention':
+            hyps = self.recognize(
+                feats,
+                feats_lengths,
+                beam_size=beam_size,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                simulate_streaming=simulate_streaming)
+            hyps = [hyp.tolist() for hyp in hyps]
+        elif decoding_method == 'ctc_greedy_search':
+            hyps = self.ctc_greedy_search(
+                feats,
+                feats_lengths,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                simulate_streaming=simulate_streaming)
+        # ctc_prefix_beam_search and attention_rescoring only return one
+        # result in List[int], change it to List[List[int]] for compatible
+        # with other batch decoding mode
+        elif decoding_method == 'ctc_prefix_beam_search':
+            assert feats.size(0) == 1
+            hyp = self.ctc_prefix_beam_search(
+                feats,
+                feats_lengths,
+                beam_size,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                simulate_streaming=simulate_streaming)
+            hyps = [hyp]
+        elif decoding_method == 'attention_rescoring':
+            assert feats.size(0) == 1
+            hyp = self.attention_rescoring(
+                feats,
+                feats_lengths,
+                beam_size,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                ctc_weight=ctc_weight,
+                simulate_streaming=simulate_streaming)
+            hyps = [hyp]
+        else:
+            raise ValueError(f"Not support decoding method: {decoding_method}")
+
+        res = [text_feature.defeaturize(hyp) for hyp in hyps]
+        return res
+

 class U2Model(U2BaseModel):
    def __init__(self, configs: dict):
@ -779,14 +884,24 @@ class U2InferModel(U2Model):
    def __init__(self, configs: dict):
        super().__init__(configs)

-    def forward(self, audio, audio_len):
+    def forward(self,
+                feats,
+                feats_lengths,
+                decoding_chunk_size=-1,
+                num_decoding_left_chunks=-1,
+                simulate_streaming=False):
        """export model function

        Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
+            feats (Tensor): [B, T, D]
+            feats_lengths (Tensor): [B]

        Returns:
-            probs: probs after softmax
+            List[List[int]]: best path result
        """
-        raise NotImplementedError("U2Model infer")
+        return self.ctc_greedy_search(
+            feats,
+            feats_lengths,
+            decoding_chunk_size=decoding_chunk_size,
+            num_decoding_left_chunks=num_decoding_left_chunks,
+            simulate_streaming=simulate_streaming)
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@ -224,9 +224,28 @@ class CTCDecoder(nn.Layer):
    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
                     lang_model_path, beam_alpha, beam_beta, beam_size,
                     cutoff_prob, cutoff_top_n, num_processes):
-        """ probs: activation after softmax 
-        logits_len: audio output lens
+        """ctc decoding with probs.
+
+        Args:
+            probs (Tensor): activation after softmax
+            logits_lens (Tensor): audio output lens
+            vocab_list (List[str]): vocabulary tokens.
+            decoding_method (str): decoding method, e.g. "ctc_greedy".
+            lang_model_path (str): language model file path.
+            beam_alpha (float): language model weight.
+            beam_beta (float): length penalty.
+            beam_size (int): beam search width.
+            cutoff_prob (float): cutoff probability for pruning.
+            cutoff_top_n (int): cutoff number for pruning.
+            num_processes (int): number of CPUs for beam search.
+
+        Raises:
+            ValueError: if decoding_method is not supported.
+
+        Returns:
+            List[str]: transcripts.
        """
+
        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
        if decoding_method == "ctc_greedy":
            result_transcripts = self._decode_batch_greedy(
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@ -176,5 +176,5 @@ class TransformerDecoder(nn.Module):
        else:
            y = x[:, -1]
        if self.use_output_layer:
-            y = paddle.log_softmax(self.output_layer(y), dim=-1)
+            y = paddle.log_softmax(self.output_layer(y), axis=-1)
        return y, new_cache
--- a/deepspeech/modules/decoder_layer.py
+++ b/deepspeech/modules/decoder_layer.py
@ -101,12 +101,17 @@ class DecoderLayer(nn.Module):
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
-            assert cache.shape == (
-                tgt.shape[0], tgt.shape[1] - 1, self.size,
-            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
+            assert cache.shape == [
+                tgt.shape[0],
+                tgt.shape[1] - 1,
+                self.size,
+            ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
-            tgt_q_mask = tgt_mask[:, -1:, :]
+            # TODO(Hui Zhang): slice not support bool type
+            # tgt_q_mask = tgt_mask[:, -1:, :]
+            tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast(
+                paddle.bool)

        if self.concat_after:
            tgt_concat = paddle.cat(
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@ -41,6 +41,7 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
        [[1., 1., 0., 0.],
         [1., 1., 1., 1.]]
    """
+    assert x_len.dim() == 1
    max_len = max_len or x_len.max()
    x_len = paddle.unsqueeze(x_len, -1)
    row_vector = paddle.arange(max_len)
@ -65,6 +66,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
                 [0, 0, 0, 1, 1],
                 [0, 0, 1, 1, 1]]
    """
+    assert lengths.dim() == 1
    batch_size = int(lengths.shape[0])
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@ -57,6 +57,9 @@ def default_argument_parser():
    # save jit model to 
    parser.add_argument("--export_path", type=str, help="path of the jit model to save")

+    # save asr result to 
+    parser.add_argument("--result_file", type=str, help="path of save the asr result")
+
    # running
    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
                        help="device type to use, cpu and gpu are supported.")
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@ -126,16 +126,24 @@ training:
    lr_decay: 1.0
  log_interval: 100

+
 decoding:
-  batch_size: 128
+  batch_size: 16
  error_rate_type: wer
-  decoding_method: ctc_beam_search
+  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
-  beam_size: 500
+  beam_size: 10
  cutoff_prob: 1.0
-  cutoff_top_n: 40
+  cutoff_top_n: 0
  num_proc_bsearch: 8
+  ctc_weight: 0.0 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.


--- a/examples/tiny/s1/local/test.sh
+++ b/examples/tiny/s1/local/test.sh
@ -11,6 +11,7 @@ python3 -u ${BIN_DIR}/test.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/conformer.yaml \
+--result_file data/asr.result \
 --output ckpt

 if [ $? -ne 0 ]; then
--- a/tests/mask_test.py
+++ b/tests/mask_test.py
@ -17,14 +17,23 @@ import numpy as np
 import unittest
 from deepspeech.modules.mask import sequence_mask
 from deepspeech.modules.mask import make_non_pad_mask
+from deepspeech.modules.mask import make_pad_mask


 class TestU2Model(unittest.TestCase):
    def setUp(self):
        paddle.set_device('cpu')
        self.lengths = paddle.to_tensor([5, 3, 2])
-        self.masks = np.array(
-            [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [1, 1, 0, 0, 0]], )
+        self.masks = np.array([
+            [1, 1, 1, 1, 1],
+            [1, 1, 1, 0, 0],
+            [1, 1, 0, 0, 0],
+        ])
+        self.pad_masks = np.array([
+            [0, 0, 0, 0, 0],
+            [0, 0, 0, 1, 1],
+            [0, 0, 1, 1, 1],
+        ])

    def test_sequence_mask(self):
        res = sequence_mask(self.lengths)
@ -32,7 +41,13 @@ class TestU2Model(unittest.TestCase):

    def test_make_non_pad_mask(self):
        res = make_non_pad_mask(self.lengths)
+        res1 = sequence_mask(self.lengths)
        self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist())
+        self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist())
+
+    def test_make_pad_mask(self):
+        res = make_pad_mask(self.lengths)
+        self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist())


 if __name__ == '__main__':