From 666b42d18b984244de13eab4de613bf6d5b811f8 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 15 Oct 2021 02:25:57 +0000
Subject: [PATCH 1/5] rename scorers

---
 deepspeech/decoders/{scores => scorers}/__init__.py         | 0
 deepspeech/decoders/{scores => scorers}/ctc.py              | 0
 deepspeech/decoders/{scores => scorers}/ctc_prefix_score.py | 0
 deepspeech/decoders/{scores => scorers}/length_bonus.py     | 0
 deepspeech/decoders/{scores => scorers}/ngram.py            | 0
 deepspeech/decoders/{scores => scorers}/score_interface.py  | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename deepspeech/decoders/{scores => scorers}/__init__.py (100%)
 rename deepspeech/decoders/{scores => scorers}/ctc.py (100%)
 rename deepspeech/decoders/{scores => scorers}/ctc_prefix_score.py (100%)
 rename deepspeech/decoders/{scores => scorers}/length_bonus.py (100%)
 rename deepspeech/decoders/{scores => scorers}/ngram.py (100%)
 rename deepspeech/decoders/{scores => scorers}/score_interface.py (100%)

diff --git a/deepspeech/decoders/scores/__init__.py b/deepspeech/decoders/scorers/__init__.py
similarity index 100%
rename from deepspeech/decoders/scores/__init__.py
rename to deepspeech/decoders/scorers/__init__.py
diff --git a/deepspeech/decoders/scores/ctc.py b/deepspeech/decoders/scorers/ctc.py
similarity index 100%
rename from deepspeech/decoders/scores/ctc.py
rename to deepspeech/decoders/scorers/ctc.py
diff --git a/deepspeech/decoders/scores/ctc_prefix_score.py b/deepspeech/decoders/scorers/ctc_prefix_score.py
similarity index 100%
rename from deepspeech/decoders/scores/ctc_prefix_score.py
rename to deepspeech/decoders/scorers/ctc_prefix_score.py
diff --git a/deepspeech/decoders/scores/length_bonus.py b/deepspeech/decoders/scorers/length_bonus.py
similarity index 100%
rename from deepspeech/decoders/scores/length_bonus.py
rename to deepspeech/decoders/scorers/length_bonus.py
diff --git a/deepspeech/decoders/scores/ngram.py b/deepspeech/decoders/scorers/ngram.py
similarity index 100%
rename from deepspeech/decoders/scores/ngram.py
rename to deepspeech/decoders/scorers/ngram.py
diff --git a/deepspeech/decoders/scores/score_interface.py b/deepspeech/decoders/scorers/score_interface.py
similarity index 100%
rename from deepspeech/decoders/scores/score_interface.py
rename to deepspeech/decoders/scorers/score_interface.py

From 0f59459a664b0f6ecf87b2fc60528040910005d4 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 15 Oct 2021 02:26:15 +0000
Subject: [PATCH 2/5] add LayerDict

---
 deepspeech/__init__.py      | 151 ++++++++++++++++++++++++++++++++++++
 deepspeech/utils/utility.py |   2 +-
 2 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index 493f10a6..5f9ba007 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -355,6 +355,8 @@ if not hasattr(paddle.Tensor, 'tolist'):
     setattr(paddle.Tensor, 'tolist', tolist)
 
 
+
+########### hack paddle.nn.functional #############
 # hack loss
 def ctc_loss(logits,
              labels,
@@ -381,3 +383,152 @@ logger.debug(
     "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
 )
 F.ctc_loss = ctc_loss
+
+
+########### hack paddle.nn #############
+from paddle.nn import Layer
+from typing import Optional
+from typing import Mapping
+from typing import Iterable
+from typing import Tuple
+from typing import Iterator
+from collections import OrderedDict, abc as container_abcs
+
+
+class LayerDict(paddle.nn.Layer):
+    r"""Holds submodules in a dictionary.
+
+    :class:`~paddle.nn.LayerDict` can be indexed like a regular Python dictionary,
+    but the modules it contains are properly registered and will be visible to all
+    :class:`~paddle.nn.Layer` methods.
+
+    :class:`~paddle.nn.LayerDict` is an **ordered** dictionary that respects
+
+    * the order of insertion, and
+
+    * in :meth:`~paddle.nn.LayerDict.update`, the order of the merged
+      ``OrderedDict``, ``dict`` (ordered since Python 3.6) or another
+      :class:`~paddle.nn.LayerDict` (the argument to
+      :meth:`~paddle.nn.LayerDict.update`).
+
+    Note that :meth:`~paddle.nn.LayerDict.update` with other unordered mapping
+    types (e.g., Python's plain ``dict`` before Python version 3.6) does not
+    preserve the order of the merged mapping.
+
+    Args:
+        modules (iterable, optional): a mapping (dictionary) of (string: module)
+            or an iterable of key-value pairs of type (string, module)
+
+    Example::
+
+        class MyModule(nn.Layer):
+            def __init__(self):
+                super(MyModule, self).__init__()
+                self.choices = nn.LayerDict({
+                    'conv': nn.Conv2D(10, 10, 3),
+                    'pool': nn.MaxPool2D(3)
+                })
+                self.activations = nn.LayerDict([
+                    ['lrelu', nn.LeakyReLU()],
+                    ['prelu', nn.PReLU()]
+                ])
+
+            def forward(self, x, choice, act):
+                x = self.choices[choice](x)
+                x = self.activations[act](x)
+                return x
+    """
+
+    def __init__(self, modules: Optional[Mapping[str, Layer]] = None) -> None:
+        super(LayerDict, self).__init__()
+        if modules is not None:
+            self.update(modules)
+
+    def __getitem__(self, key: str) -> Layer:
+        return self._sub_layers[key]
+
+    def __setitem__(self, key: str, module: Layer) -> None:
+        # paddle.nn.Layer registers children via `add_sublayer`
+        # (the PyTorch-style `add_module`/`_modules` does not exist on Layer)
+        self.add_sublayer(key, module)
+
+    def __delitem__(self, key: str) -> None:
+        del self._sub_layers[key]
+
+    def __len__(self) -> int:
+        return len(self._sub_layers)
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self._sub_layers)
+
+    def __contains__(self, key: str) -> bool:
+        return key in self._sub_layers
+
+    def clear(self) -> None:
+        """Remove all items from the LayerDict.
+        """
+        self._sub_layers.clear()
+
+    def pop(self, key: str) -> Layer:
+        r"""Remove key from the LayerDict and return its module.
+
+        Args:
+            key (string): key to pop from the LayerDict
+        """
+        v = self[key]
+        del self[key]
+        return v
+
+    def keys(self) -> Iterable[str]:
+        r"""Return an iterable of the LayerDict keys.
+        """
+        return self._sub_layers.keys()
+
+    def items(self) -> Iterable[Tuple[str, Layer]]:
+        r"""Return an iterable of the LayerDict key/value pairs.
+        """
+        return self._sub_layers.items()
+
+    def values(self) -> Iterable[Layer]:
+        r"""Return an iterable of the LayerDict values.
+        """
+        return self._sub_layers.values()
+
+    def update(self, modules: Mapping[str, Layer]) -> None:
+        r"""Update the :class:`~paddle.nn.LayerDict` with the key-value pairs from a
+        mapping or an iterable, overwriting existing keys.
+
+        .. note::
+            If :attr:`modules` is an ``OrderedDict``, a :class:`~paddle.nn.LayerDict`, or
+            an iterable of key-value pairs, the order of new elements in it is preserved.
+
+        Args:
+            modules (iterable): a mapping (dictionary) from string to :class:`~paddle.nn.Layer`,
+                or an iterable of key-value pairs of type (string, :class:`~paddle.nn.Layer`)
+        """
+        if not isinstance(modules, container_abcs.Iterable):
+            raise TypeError("LayerDict.update should be called with an "
+                            "iterable of key/value pairs, but got " +
+                            type(modules).__name__)
+
+        if isinstance(modules, (OrderedDict, LayerDict, container_abcs.Mapping)):
+            for key, module in modules.items():
+                self[key] = module
+        else:
+            # modules here can be a list of (name, layer) pairs
+            for j, m in enumerate(modules):
+                if not isinstance(m, container_abcs.Iterable):
+                    raise TypeError("LayerDict update sequence element "
+                                    "#" + str(j) + " should be Iterable; is " +
+                                    type(m).__name__)
+                if not len(m) == 2:
+                    raise ValueError("LayerDict update sequence element "
+                                     "#" + str(j) + " has length " + str(len(m)) +
+                                     "; 2 is required")
+                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
+                # that's too cumbersome to type correctly with overloads, so we add an ignore here
+                self[m[0]] = m[1]  # type: ignore[assignment]
+
+    # remove forward altogether to fall back on Layer's default unimplemented forward
+
+
+if not hasattr(paddle.nn, 'LayerDict'):
+    logger.debug(
+        "register user LayerDict to paddle.nn, remove this when fixed!")
+    setattr(paddle.nn, 'LayerDict', LayerDict)
diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py
index ba5acbb9..8773b84c 100644
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@@ -42,7 +42,7 @@ def all_version():
         "paddle_commit": paddle.version.commit,
         "soundfile": soundfile.__version__,
     }
-    logger.info(f"Deps Module Version:{pformat(vers.items())}")
+    logger.info(f"Deps Module Version:{pformat(list(vers.items()))}")
 
 
 @contextmanager
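A quick usage sketch for the LayerDict hack above (layer names are illustrative; assumes the deepspeech package is importable so the registration side effect has run):

    import paddle
    import paddle.nn as nn
    import deepspeech  # side effect: registers LayerDict onto paddle.nn

    choices = nn.LayerDict({
        'linear': nn.Linear(8, 8),
        'relu': nn.ReLU(),
    })
    x = paddle.randn([2, 8])
    y = choices['relu'](choices['linear'](x))  # index like a dict
    print(list(choices.keys()))  # ['linear', 'relu']
    # sublayers are registered, so their parameters are visible to Layer methods
    print(len(choices.parameters()))  # 2: the Linear weight and bias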
From 9e6f89bde1e0dd92dc5f8644ec36b55193dffedd Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 19 Oct 2021 06:40:46 +0000
Subject: [PATCH 3/5] fix score doc

---
 deepspeech/decoders/scorers/score_interface.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepspeech/decoders/scorers/score_interface.py b/deepspeech/decoders/scorers/score_interface.py
index 3a9c500b..366904a4 100644
--- a/deepspeech/decoders/scorers/score_interface.py
+++ b/deepspeech/decoders/scorers/score_interface.py
@@ -145,9 +145,11 @@ class PartialScorerInterface(ScorerInterface):
     and receives pre-pruned next tokens to score because it is too heavy to score
     all the tokens.
 
+    Score a subset of tokens, not the whole vocabulary.
+
     Examples:
         * Prefix search for connectionist-temporal-classification models
-            * :class:`espnet.nets.scorers.ctc.CTCPrefixScorer`
+            * :class:`decoders.scorers.ctc.CTCPrefixScorer`
 
     """
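For reference, a minimal full scorer satisfying this interface might look like the sketch below. The class is illustrative and not part of the patch, and the module path assumes the score_interface.py name used in this series:

    import paddle

    from deepspeech.decoders.scorers.score_interface import ScorerInterface


    class UniformScorer(ScorerInterface):
        """Toy full scorer giving every token the same score (illustrative)."""

        def __init__(self, n_vocab: int):
            self.n_vocab = n_vocab

        def init_state(self, x: paddle.Tensor):
            # this scorer is stateless
            return None

        def score(self, y: paddle.Tensor, state, x: paddle.Tensor):
            # return (n_vocab,) scores for the next token, plus the new state
            return paddle.zeros([self.n_vocab], dtype=x.dtype), state

A partial scorer additionally implements `score_partial`/`select_state` so the beam search can score only the pre-pruned token ids, as in the patch that follows.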
From 6257eda00e6697e758d18fbc3065ece9ffd02ff7 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 21 Oct 2021 07:40:41 +0000
Subject: [PATCH 4/5] add beam search with scorers

---
 deepspeech/decoders/beam_search.py | 527 ++++++++++++++++++++++++++++++
 deepspeech/io/batchfy.py           |   2 +-
 2 files changed, 528 insertions(+), 1 deletion(-)
 create mode 100644 deepspeech/decoders/beam_search.py

diff --git a/deepspeech/decoders/beam_search.py b/deepspeech/decoders/beam_search.py
new file mode 100644
index 00000000..5bf0d3e2
--- /dev/null
+++ b/deepspeech/decoders/beam_search.py
@@ -0,0 +1,527 @@
+"""Beam search module."""
+
+from itertools import chain
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import NamedTuple
+from typing import Tuple
+from typing import Union
+
+import paddle
+
+from .utils import end_detect
+from .scorers.scorer_interface import PartialScorerInterface
+from .scorers.scorer_interface import ScorerInterface
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+class Hypothesis(NamedTuple):
+    """Hypothesis data type."""
+
+    yseq: paddle.Tensor  # (T,)
+    score: Union[float, paddle.Tensor] = 0
+    scores: Dict[str, Union[float, paddle.Tensor]] = dict()
+    states: Dict[str, Any] = dict()
+
+    def asdict(self) -> dict:
+        """Convert data to JSON-friendly dict."""
+        return self._replace(
+            yseq=self.yseq.tolist(),
+            score=float(self.score),
+            scores={k: float(v) for k, v in self.scores.items()},
+        )._asdict()
+
+
+class BeamSearch(paddle.nn.Layer):
+    """Beam search implementation."""
+
+    def __init__(
+        self,
+        scorers: Dict[str, ScorerInterface],
+        weights: Dict[str, float],
+        beam_size: int,
+        vocab_size: int,
+        sos: int,
+        eos: int,
+        token_list: List[str] = None,
+        pre_beam_ratio: float = 1.5,
+        pre_beam_score_key: str = None,
+    ):
+        """Initialize beam search.
+
+        Args:
+            scorers (dict[str, ScorerInterface]): Dict of decoder modules
+                e.g., Decoder, CTCPrefixScorer, LM.
+                The scorer will be ignored if it is `None`.
+            weights (dict[str, float]): Dict of weights for each scorer.
+                The scorer will be ignored if its weight is 0.
+            beam_size (int): The number of hypotheses kept during search
+            vocab_size (int): The size of the vocabulary
+            sos (int): Start of sequence id
+            eos (int): End of sequence id
+            token_list (list[str]): List of tokens for debug log
+            pre_beam_score_key (str): key of scores to perform pre-beam search
+            pre_beam_ratio (float): beam size in the pre-beam search
+                will be `int(pre_beam_ratio * beam_size)`
+
+        """
+        super().__init__()
+        # set scorers
+        self.weights = weights
+        self.scorers = dict()  # all = full + partial
+        self.full_scorers = dict()  # full tokens
+        self.part_scorers = dict()  # partial tokens
+        # this module dict is required for recursive cast
+        # `self.to(device, dtype)` in `recog.py`
+        self.nn_dict = paddle.nn.LayerDict()  # nn.Layer
+        for k, v in scorers.items():
+            w = weights.get(k, 0)
+            if w == 0 or v is None:
+                continue
+            assert isinstance(
+                v, ScorerInterface
+            ), f"{k} ({type(v)}) does not implement ScorerInterface"
+            self.scorers[k] = v
+            if isinstance(v, PartialScorerInterface):
+                self.part_scorers[k] = v
+            else:
+                self.full_scorers[k] = v
+            if isinstance(v, paddle.nn.Layer):
+                self.nn_dict[k] = v
+
+        # set configurations
+        self.sos = sos
+        self.eos = eos
+        self.token_list = token_list
+        # pre_beam_size > beam_size
+        self.pre_beam_size = int(pre_beam_ratio * beam_size)
+        self.beam_size = beam_size
+        self.n_vocab = vocab_size
+        if (
+            pre_beam_score_key is not None
+            and pre_beam_score_key != "full"
+            and pre_beam_score_key not in self.full_scorers
+        ):
+            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
+        # selected `key` scorer to do pre beam search
+        self.pre_beam_score_key = pre_beam_score_key
+        # do pre-beam only when needed: the key is valid and there are part_scorers
+        self.do_pre_beam = (
+            self.pre_beam_score_key is not None
+            and self.pre_beam_size < self.n_vocab
+            and len(self.part_scorers) > 0
+        )
+
+    def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
+        """Get an initial hypothesis data.
+
+        Args:
+            x (paddle.Tensor): The encoder output feature, (T, D)
+
+        Returns:
+            List[Hypothesis]: The initial hypothesis.
+
+        """
+        init_states = dict()
+        init_scores = dict()
+        for k, d in self.scorers.items():
+            init_states[k] = d.init_state(x)
+            init_scores[k] = 0.0
+        return [
+            Hypothesis(
+                yseq=paddle.to_tensor([self.sos], place=x.place),
+                score=0.0,
+                scores=init_scores,
+                states=init_states,
+            )
+        ]
+
+    @staticmethod
+    def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor:
+        """Append new token to prefix tokens.
+
+        Args:
+            xs (paddle.Tensor): The prefix tokens, (T,)
+            x (int): The new token to append
+
+        Returns:
+            paddle.Tensor: (T+1,), new tensor containing xs + [x], with xs.dtype and xs.place
+
+        """
+        x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place)
+        # `paddle.cat` is expected to be provided by the compatibility
+        # hacks in deepspeech/__init__.py (alias of paddle.concat)
+        return paddle.cat((xs, x))
+
+    def score_full(
+        self, hyp: Hypothesis, x: paddle.Tensor
+    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
+        """Score new hypothesis by `self.full_scorers`.
+
+        Args:
+            hyp (Hypothesis): Hypothesis with prefix tokens to score
+            x (paddle.Tensor): Corresponding input feature, (T, D)
+
+        Returns:
+            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
+                score dict of `hyp` that has string keys of `self.full_scorers`
+                and tensor score values of shape `(self.n_vocab,)`,
+                and state dict that has string keys
+                and state values of `self.full_scorers`
+
+        """
+        scores = dict()
+        states = dict()
+        for k, d in self.full_scorers.items():
+            # scores[k] shape (self.n_vocab,)
+            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
+        return scores, states
+
+    def score_partial(
+        self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor
+    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
+        """Score new hypothesis by `self.part_scorers`.
+
+        Args:
+            hyp (Hypothesis): Hypothesis with prefix tokens to score
+            ids (paddle.Tensor): 1D tensor of new partial tokens to score,
+                len(ids) < n_vocab
+            x (paddle.Tensor): Corresponding input feature, (T, D)
+
+        Returns:
+            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
+                score dict of `hyp` that has string keys of `self.part_scorers`
+                and tensor score values of shape `(len(ids),)`,
+                and state dict that has string keys
+                and state values of `self.part_scorers`
+
+        """
+        scores = dict()
+        states = dict()
+        for k, d in self.part_scorers.items():
+            # scores[k] shape (len(ids),)
+            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
+        return scores, states
+
+    def beam(
+        self, weighted_scores: paddle.Tensor, ids: paddle.Tensor
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """Compute topk full token ids and partial token ids.
+
+        Args:
+            weighted_scores (paddle.Tensor): The weighted sum scores for each token.
+                Its shape is `(self.n_vocab,)`.
+            ids (paddle.Tensor): The partial (global) token ids to compute topk over.
+
+        Returns:
+            Tuple[paddle.Tensor, paddle.Tensor]:
+                The topk full token ids and partial token ids.
+                Their shapes are `(self.beam_size,)`,
+                i.e. (global ids, local ids relative to `ids`).
+
+        """
+        # no pre-beam performed, `ids` is as long as `weighted_scores`
+        if weighted_scores.size(0) == ids.size(0):
+            top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
+            return top_ids, top_ids
+
+        # mask out tokens pruned by pre-beam so topk cannot select them
+        tmp = weighted_scores[ids]
+        weighted_scores[:] = -float("inf")
+        weighted_scores[ids] = tmp
+        # top_ids is not equal to local_ids, since they index different shapes
+        top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
+        local_ids = weighted_scores[ids].topk(self.beam_size)[1]  # index in len(ids)
+        return top_ids, local_ids
+
+    @staticmethod
+    def merge_scores(
+        prev_scores: Dict[str, float],
+        next_full_scores: Dict[str, paddle.Tensor],
+        full_idx: int,
+        next_part_scores: Dict[str, paddle.Tensor],
+        part_idx: int,
+    ) -> Dict[str, paddle.Tensor]:
+        """Merge scores for new hypothesis.
+
+        Args:
+            prev_scores (Dict[str, float]):
+                The previous hypothesis scores by `self.scorers`
+            next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers`
+            full_idx (int): The next token id for `next_full_scores`
+            next_part_scores (Dict[str, paddle.Tensor]):
+                scores of partial tokens by `self.part_scorers`
+            part_idx (int): The new token id for `next_part_scores`
+
+        Returns:
+            Dict[str, paddle.Tensor]: The new score dict.
+                Its keys are names of `self.full_scorers` and `self.part_scorers`.
+                Its values are scalar tensors by the scorers.
+
+        """
+        new_scores = dict()
+        for k, v in next_full_scores.items():
+            new_scores[k] = prev_scores[k] + v[full_idx]
+        for k, v in next_part_scores.items():
+            new_scores[k] = prev_scores[k] + v[part_idx]
+        return new_scores
+
+    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
+        """Merge states for new hypothesis.
+
+        Args:
+            states: states of `self.full_scorers`
+            part_states: states of `self.part_scorers`
+            part_idx (int): The new token id for `part_scores`
+
+        Returns:
+            Dict[str, Any]: The new state dict.
+                Its keys are names of `self.full_scorers` and `self.part_scorers`.
+                Its values are states of the scorers.
+
+        """
+        new_states = dict()
+        for k, v in states.items():
+            new_states[k] = v
+        for k, d in self.part_scorers.items():
+            new_states[k] = d.select_state(part_states[k], part_idx)
+        return new_states
+
+    def search(
+        self, running_hyps: List[Hypothesis], x: paddle.Tensor
+    ) -> List[Hypothesis]:
+        """Search new tokens for running hypotheses and encoded speech x.
+
+        Args:
+            running_hyps (List[Hypothesis]): Running hypotheses on beam
+            x (paddle.Tensor): Encoded speech feature (T, D)
+
+        Returns:
+            List[Hypothesis]: Best sorted hypotheses
+
+        """
+        best_hyps = []
+        part_ids = paddle.arange(self.n_vocab)  # no pre-beam
+        for hyp in running_hyps:
+            # scoring
+            weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype)
+            scores, states = self.score_full(hyp, x)
+            for k in self.full_scorers:
+                weighted_scores += self.weights[k] * scores[k]
+            # partial scoring
+            if self.do_pre_beam:
+                pre_beam_scores = (
+                    weighted_scores
+                    if self.pre_beam_score_key == "full"
+                    else scores[self.pre_beam_score_key]
+                )
+                part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
+            part_scores, part_states = self.score_partial(hyp, part_ids, x)
+            for k in self.part_scorers:
+                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
+            # add previous hyp score
+            weighted_scores += hyp.score
+
+            # update hyps
+            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
+                # `part_j` is the id of `j` relative to `part_scores`
+                # (2 x beam at most will be appended)
+                best_hyps.append(
+                    Hypothesis(
+                        score=weighted_scores[j],
+                        yseq=self.append_token(hyp.yseq, j),
+                        scores=self.merge_scores(
+                            hyp.scores, scores, j, part_scores, part_j
+                        ),
+                        states=self.merge_states(states, part_states, part_j),
+                    )
+                )
+
+            # sort and prune 2 x beam -> beam
+            best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
+                : min(len(best_hyps), self.beam_size)
+            ]
+        return best_hyps
+
+    def forward(
+        self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
+    ) -> List[Hypothesis]:
+        """Perform beam search.
+
+        Args:
+            x (paddle.Tensor): Encoded speech feature (T, D)
+            maxlenratio (float): Input length ratio to obtain max output length.
+                If maxlenratio=0.0 (default), it uses an end-detect function
+                to automatically find maximum hypothesis lengths.
+                If maxlenratio<0.0, its absolute value is interpreted
+                as a constant max output length.
+            minlenratio (float): Input length ratio to obtain min output length.
+
+        Returns:
+            list[Hypothesis]: N-best decoding results
+
+        """
+        # set length bounds
+        if maxlenratio == 0:
+            maxlen = x.shape[0]
+        elif maxlenratio < 0:
+            maxlen = -1 * int(maxlenratio)
+        else:
+            maxlen = max(1, int(maxlenratio * x.size(0)))
+        minlen = int(minlenratio * x.size(0))
+        logger.info("decoder input length: " + str(x.shape[0]))
+        logger.info("max output length: " + str(maxlen))
+        logger.info("min output length: " + str(minlen))
+
+        # main loop of prefix search
+        running_hyps = self.init_hyp(x)
+        ended_hyps = []
+        for i in range(maxlen):
+            logger.debug("position " + str(i))
+            best = self.search(running_hyps, x)
+            # post process of one iteration
+            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
+            # end detection
+            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
+                logger.info(f"end detected at {i}")
+                break
+            if len(running_hyps) == 0:
+                logger.info("no hypothesis. Finish decoding.")
+                break
+            else:
+                logger.debug(f"remained hypotheses: {len(running_hyps)}")
+
+        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
+        # check the number of hypotheses reaching eos
+        if len(nbest_hyps) == 0:
+            logger.warning(
+                "there are no N-best results, perform recognition "
+                "again with smaller minlenratio."
+            )
+            return (
+                []
+                if minlenratio < 0.1
+                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
+            )
+
+        # report the best result
+        best = nbest_hyps[0]
+        for k, v in best.scores.items():
+            logger.info(
+                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
+            )
+        logger.info(f"total log probability: {best.score:.2f}")
+        logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
+        logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
+        if self.token_list is not None:
+            logger.info(
+                "best hypo: "
+                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
+                + "\n"
+            )
+        return nbest_hyps
+
+    def post_process(
+        self,
+        i: int,
+        maxlen: int,
+        maxlenratio: float,
+        running_hyps: List[Hypothesis],
+        ended_hyps: List[Hypothesis],
+    ) -> List[Hypothesis]:
+        """Perform post-processing of beam search iterations.
+
+        Args:
+            i (int): The length of hypothesis tokens.
+            maxlen (int): The maximum length of tokens in beam search.
+            maxlenratio (float): The maximum length ratio in beam search.
+            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
+            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.
+
+        Returns:
+            List[Hypothesis]: The new running hypotheses.
+
+        """
+        logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
+        if self.token_list is not None:
+            logger.debug(
+                "best hypo: "
+                + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
+            )
+        # add eos in the final loop to avoid that there are no ended hyps
+        if i == maxlen - 1:
+            logger.info("adding <eos> in the last position in the loop")
+            running_hyps = [
+                h._replace(yseq=self.append_token(h.yseq, self.eos))
+                for h in running_hyps
+            ]
+
+        # add ended hypotheses to a final list, and remove them from current hypotheses
+        # (this can make the number of running hyps smaller than the beam size)
+        remained_hyps = []
+        for hyp in running_hyps:
+            if hyp.yseq[-1] == self.eos:
+                # e.g., Word LM needs to add final score
+                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
+                    s = d.final_score(hyp.states[k])
+                    hyp.scores[k] += s
+                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
+                ended_hyps.append(hyp)
+            else:
+                remained_hyps.append(hyp)
+        return remained_hyps
+
+
+def beam_search(
+    x: paddle.Tensor,
+    sos: int,
+    eos: int,
+    beam_size: int,
+    vocab_size: int,
+    scorers: Dict[str, ScorerInterface],
+    weights: Dict[str, float],
+    token_list: List[str] = None,
+    maxlenratio: float = 0.0,
+    minlenratio: float = 0.0,
+    pre_beam_ratio: float = 1.5,
+    pre_beam_score_key: str = "full",
+) -> list:
+    """Perform beam search with scorers.
+
+    Args:
+        x (paddle.Tensor): Encoded speech feature (T, D)
+        sos (int): Start of sequence id
+        eos (int): End of sequence id
+        beam_size (int): The number of hypotheses kept during search
+        vocab_size (int): The size of the vocabulary
+        scorers (dict[str, ScorerInterface]): Dict of decoder modules
+            e.g., Decoder, CTCPrefixScorer, LM.
+            The scorer will be ignored if it is `None`.
+        weights (dict[str, float]): Dict of weights for each scorer.
+            The scorer will be ignored if its weight is 0.
+        token_list (list[str]): List of tokens for debug log
+        maxlenratio (float): Input length ratio to obtain max output length.
+            If maxlenratio=0.0 (default), it uses an end-detect function
+            to automatically find maximum hypothesis lengths.
+        minlenratio (float): Input length ratio to obtain min output length.
+        pre_beam_score_key (str): key of scores to perform pre-beam search
+        pre_beam_ratio (float): beam size in the pre-beam search
+            will be `int(pre_beam_ratio * beam_size)`
+
+    Returns:
+        List[Dict]: N-best decoding results
+
+    """
+    ret = BeamSearch(
+        scorers,
+        weights,
+        beam_size=beam_size,
+        vocab_size=vocab_size,
+        pre_beam_ratio=pre_beam_ratio,
+        pre_beam_score_key=pre_beam_score_key,
+        sos=sos,
+        eos=eos,
+        token_list=token_list,
+    ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
+    return [h.asdict() for h in ret]
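A sketch of driving the `beam_search` helper above; the `LengthBonus` signature is assumed from the renamed scorers package, and a real setup would also pass a decoder and/or CTC scorer implementing `ScorerInterface`:

    import paddle

    from deepspeech.decoders.beam_search import beam_search
    from deepspeech.decoders.scorers.length_bonus import LengthBonus

    enc_out = paddle.randn([83, 256])  # (T, D) encoder output, random stand-in
    token_list = ['<blank>', 'a', 'b', 'c', '<sos/eos>']

    nbest = beam_search(
        x=enc_out,
        sos=len(token_list) - 1,
        eos=len(token_list) - 1,
        beam_size=5,
        vocab_size=len(token_list),
        scorers={'length_bonus': LengthBonus(len(token_list))},
        weights={'length_bonus': 1.0},
        token_list=token_list,
        maxlenratio=0.1,  # cap hypothesis length at 10% of input frames
    )
    print(nbest[0]['yseq'])  # best token id sequence, including sos/eos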
diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py
index de29d054..06cb3c9d 100644
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
@@ -354,7 +354,7 @@ def make_batchset(
     :param int batch_frames_out: maximum number of output frames in a minibatch.
     :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
     :param str count: strategy to count maximum size of batch.
-        For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
+        For choices, see io.batchfy.BATCH_COUNT_CHOICES
     :param int max_length_in: maximum length of input to decide adaptive batch size
     :param int max_length_out: maximum length of output to decide adaptive batch size
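For the batchfy helper touched above, a hypothetical call looks like the sketch below; the argument names follow the espnet-style signature this module descends from and should be treated as assumptions, not a verified API:

    from deepspeech.io.batchfy import make_batchset

    # stand-in manifest: utt id -> shape info, mirroring the json format
    # this module expects (structure assumed)
    data = {
        'utt1': {'input': [{'shape': [310, 83]}], 'output': [{'shape': [12, 5002]}]},
        'utt2': {'input': [{'shape': [205, 83]}], 'output': [{'shape': [8, 5002]}]},
    }
    minibatches = make_batchset(
        data,
        batch_size=2,
        max_length_in=800,   # adaptive batch size: cap on input length
        max_length_out=150,  # adaptive batch size: cap on output length
        count='auto',        # see io.batchfy.BATCH_COUNT_CHOICES
    )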
From 26a4a46c417d7bfb26c49772964a9834c058281d Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 21 Oct 2021 07:44:46 +0000
Subject: [PATCH 5/5] fix ctc scorer name

---
 deepspeech/decoders/scorers/ctc.py              | 4 ++--
 deepspeech/decoders/scorers/ctc_prefix_score.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/deepspeech/decoders/scorers/ctc.py b/deepspeech/decoders/scorers/ctc.py
index 4871d6e1..36b4bfd3 100644
--- a/deepspeech/decoders/scorers/ctc.py
+++ b/deepspeech/decoders/scorers/ctc.py
@@ -15,8 +15,8 @@
 import numpy as np
 import paddle
 
-from .ctc_prefix_score import CTCPrefixScore
-from .ctc_prefix_score import CTCPrefixScorePD
+from .ctc_prefix_score import CTCPrefixScorer
+from .ctc_prefix_score import CTCPrefixScorerPD
 from .scorer_interface import BatchPartialScorerInterface
 
 
diff --git a/deepspeech/decoders/scorers/ctc_prefix_score.py b/deepspeech/decoders/scorers/ctc_prefix_score.py
index c85d546d..5f568c81 100644
--- a/deepspeech/decoders/scorers/ctc_prefix_score.py
+++ b/deepspeech/decoders/scorers/ctc_prefix_score.py
@@ -6,7 +6,7 @@
 import paddle
 
 import six
 
 
-class CTCPrefixScorePD():
+class CTCPrefixScorerPD():
     """Batch processing of CTCPrefixScore
 
     which is based on Algorithm 2 in WATANABE et al.
@@ -267,7 +267,7 @@ class CTCPrefixScorePD():
         return (r_prev_new, s_prev, f_min_prev, f_max_prev)
 
 
-class CTCPrefixScore():
+class CTCPrefixScorer():
    """Compute CTC label sequence scores

    which is based on Algorithm 2 in WATANABE et al.