From 0a76751964742e32af26cf26d7e86767cdc42933 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 15 Oct 2021 02:25:57 +0000 Subject: [PATCH 01/17] rename scorers --- deepspeech/decoders/{scores => scorers}/__init__.py | 0 deepspeech/decoders/{scores => scorers}/ctc.py | 0 deepspeech/decoders/{scores => scorers}/ctc_prefix_score.py | 0 deepspeech/decoders/{scores => scorers}/length_bonus.py | 0 deepspeech/decoders/{scores => scorers}/ngram.py | 0 deepspeech/decoders/{scores => scorers}/score_interface.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename deepspeech/decoders/{scores => scorers}/__init__.py (100%) rename deepspeech/decoders/{scores => scorers}/ctc.py (100%) rename deepspeech/decoders/{scores => scorers}/ctc_prefix_score.py (100%) rename deepspeech/decoders/{scores => scorers}/length_bonus.py (100%) rename deepspeech/decoders/{scores => scorers}/ngram.py (100%) rename deepspeech/decoders/{scores => scorers}/score_interface.py (100%) diff --git a/deepspeech/decoders/scores/__init__.py b/deepspeech/decoders/scorers/__init__.py similarity index 100% rename from deepspeech/decoders/scores/__init__.py rename to deepspeech/decoders/scorers/__init__.py diff --git a/deepspeech/decoders/scores/ctc.py b/deepspeech/decoders/scorers/ctc.py similarity index 100% rename from deepspeech/decoders/scores/ctc.py rename to deepspeech/decoders/scorers/ctc.py diff --git a/deepspeech/decoders/scores/ctc_prefix_score.py b/deepspeech/decoders/scorers/ctc_prefix_score.py similarity index 100% rename from deepspeech/decoders/scores/ctc_prefix_score.py rename to deepspeech/decoders/scorers/ctc_prefix_score.py diff --git a/deepspeech/decoders/scores/length_bonus.py b/deepspeech/decoders/scorers/length_bonus.py similarity index 100% rename from deepspeech/decoders/scores/length_bonus.py rename to deepspeech/decoders/scorers/length_bonus.py diff --git a/deepspeech/decoders/scores/ngram.py b/deepspeech/decoders/scorers/ngram.py similarity index 100% rename from deepspeech/decoders/scores/ngram.py rename to deepspeech/decoders/scorers/ngram.py diff --git a/deepspeech/decoders/scores/score_interface.py b/deepspeech/decoders/scorers/score_interface.py similarity index 100% rename from deepspeech/decoders/scores/score_interface.py rename to deepspeech/decoders/scorers/score_interface.py From cb51c9c2dd25310c828a296b25f7b4b0c849448a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 15 Oct 2021 02:26:15 +0000 Subject: [PATCH 02/17] add LayerDict --- deepspeech/__init__.py | 151 ++++++++++++++++++++++++++++++++++++ deepspeech/utils/utility.py | 2 +- 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index 493f10a6f..5f9ba007e 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -355,6 +355,8 @@ if not hasattr(paddle.Tensor, 'tolist'): setattr(paddle.Tensor, 'tolist', tolist) + +########### hcak paddle.nn.functional ############# # hack loss def ctc_loss(logits, labels, @@ -381,3 +383,152 @@ logger.debug( "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!" ) F.ctc_loss = ctc_loss + + +########### hcak paddle.nn ############# +from paddle.nn import Layer +from typing import Optional +from typing import Mapping +from typing import Iterable +from typing import Tuple +from typing import Iterator +from collections import OrderedDict, abc as container_abcs + +class LayerDict(paddle.nn.Layer): + r"""Holds submodules in a dictionary. 
+ + :class:`~paddle.nn.LayerDict` can be indexed like a regular Python dictionary, + but modules it contains are properly registered, and will be visible by all + :class:`~paddle.nn.Layer` methods. + + :class:`~paddle.nn.LayerDict` is an **ordered** dictionary that respects + + * the order of insertion, and + + * in :meth:`~paddle.nn.LayerDict.update`, the order of the merged + ``OrderedDict``, ``dict`` (started from Python 3.6) or another + :class:`~paddle.nn.LayerDict` (the argument to + :meth:`~paddle.nn.LayerDict.update`). + + Note that :meth:`~paddle.nn.LayerDict.update` with other unordered mapping + types (e.g., Python's plain ``dict`` before Python version 3.6) does not + preserve the order of the merged mapping. + + Args: + modules (iterable, optional): a mapping (dictionary) of (string: module) + or an iterable of key-value pairs of type (string, module) + + Example:: + + class MyModule(nn.Layer): + def __init__(self): + super(MyModule, self).__init__() + self.choices = nn.LayerDict({ + 'conv': nn.Conv2d(10, 10, 3), + 'pool': nn.MaxPool2d(3) + }) + self.activations = nn.LayerDict([ + ['lrelu', nn.LeakyReLU()], + ['prelu', nn.PReLU()] + ]) + + def forward(self, x, choice, act): + x = self.choices[choice](x) + x = self.activations[act](x) + return x + """ + + def __init__(self, modules: Optional[Mapping[str, Layer]] = None) -> None: + super(LayerDict, self).__init__() + if modules is not None: + self.update(modules) + + def __getitem__(self, key: str) -> Layer: + return self._modules[key] + + def __setitem__(self, key: str, module: Layer) -> None: + self.add_module(key, module) + + def __delitem__(self, key: str) -> None: + del self._modules[key] + + def __len__(self) -> int: + return len(self._modules) + + def __iter__(self) -> Iterator[str]: + return iter(self._modules) + + def __contains__(self, key: str) -> bool: + return key in self._modules + + def clear(self) -> None: + """Remove all items from the LayerDict. + """ + self._modules.clear() + + def pop(self, key: str) -> Layer: + r"""Remove key from the LayerDict and return its module. + + Args: + key (string): key to pop from the LayerDict + """ + v = self[key] + del self[key] + return v + + def keys(self) -> Iterable[str]: + r"""Return an iterable of the LayerDict keys. + """ + return self._modules.keys() + + def items(self) -> Iterable[Tuple[str, Layer]]: + r"""Return an iterable of the LayerDict key/value pairs. + """ + return self._modules.items() + + def values(self) -> Iterable[Layer]: + r"""Return an iterable of the LayerDict values. + """ + return self._modules.values() + + def update(self, modules: Mapping[str, Layer]) -> None: + r"""Update the :class:`~paddle.nn.LayerDict` with the key-value pairs from a + mapping or an iterable, overwriting existing keys. + + .. note:: + If :attr:`modules` is an ``OrderedDict``, a :class:`~paddle.nn.LayerDict`, or + an iterable of key-value pairs, the order of new elements in it is preserved. 
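
        For example, ``update`` also accepts an iterable of ``(name, layer)``
        pairs (a hypothetical usage sketch, mirroring ``torch.nn.ModuleDict``,
        which this class is ported from)::

            layers = paddle.nn.LayerDict()
            layers.update([('fc', paddle.nn.Linear(4, 4))])
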
+ + Args: + modules (iterable): a mapping (dictionary) from string to :class:`~paddle.nn.Layer`, + or an iterable of key-value pairs of type (string, :class:`~paddle.nn.Layer`) + """ + if not isinstance(modules, container_abcs.Iterable): + raise TypeError("LayerDict.update should be called with an " + "iterable of key/value pairs, but got " + + type(modules).__name__) + + if isinstance(modules, (OrderedDict, LayerDict, container_abcs.Mapping)): + for key, module in modules.items(): + self[key] = module + else: + # modules here can be a list with two items + for j, m in enumerate(modules): + if not isinstance(m, container_abcs.Iterable): + raise TypeError("LayerDict update sequence element " + "#" + str(j) + " should be Iterable; is" + + type(m).__name__) + if not len(m) == 2: + raise ValueError("LayerDict update sequence element " + "#" + str(j) + " has length " + str(len(m)) + + "; 2 is required") + # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)] + # that's too cumbersome to type correctly with overloads, so we add an ignore here + self[m[0]] = m[1] # type: ignore[assignment] + + # remove forward alltogether to fallback on Module's _forward_unimplemented + +if not hasattr(paddle.nn, 'LayerDict'): + logger.debug( + "register user LayerDict to paddle.nn, remove this when fixed!") + setattr(paddle.nn, 'LayerDict', LayerDict) diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py index ba5acbb96..8773b84ce 100644 --- a/deepspeech/utils/utility.py +++ b/deepspeech/utils/utility.py @@ -42,7 +42,7 @@ def all_version(): "paddle_commit": paddle.version.commit, "soundfile": soundfile.__version__, } - logger.info(f"Deps Module Version:{pformat(vers.items())}") + logger.info(f"Deps Module Version:{pformat(list(vers.items()))}") @contextmanager From 7af855bddccde2135b3f705e77bde7c8b61788ea Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 19 Oct 2021 06:40:46 +0000 Subject: [PATCH 03/17] fix score doc --- deepspeech/decoders/scorers/score_interface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeech/decoders/scorers/score_interface.py b/deepspeech/decoders/scorers/score_interface.py index 3a9c500ba..366904a47 100644 --- a/deepspeech/decoders/scorers/score_interface.py +++ b/deepspeech/decoders/scorers/score_interface.py @@ -145,9 +145,11 @@ class PartialScorerInterface(ScorerInterface): and receives pre-pruned next tokens to score because it is too heavy to score all the tokens. + Score sub-set of tokens, not all. 
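
    Unlike a full scorer, a partial scorer implements
    ``score_partial(y, next_tokens, state, x)`` and is asked to score only the
    pre-pruned ``next_tokens`` rather than the whole vocabulary.
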
+ Examples: * Prefix search for connectionist-temporal-classification models - * :class:`espnet.nets.scorers.ctc.CTCPrefixScorer` + * :class:`decoders.scorers.ctc.CTCPrefixScorer` """ From de79b3fad67c6c54d7b6a45a41da755e767a46bb Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 21 Oct 2021 07:40:41 +0000 Subject: [PATCH 04/17] add beam search with scorers --- deepspeech/decoders/beam_search.py | 528 +++++++++++++++++++++++++++++ deepspeech/io/batchfy.py | 2 +- 2 files changed, 529 insertions(+), 1 deletion(-) create mode 100644 deepspeech/decoders/beam_search.py diff --git a/deepspeech/decoders/beam_search.py b/deepspeech/decoders/beam_search.py new file mode 100644 index 000000000..5bf0d3e2f --- /dev/null +++ b/deepspeech/decoders/beam_search.py @@ -0,0 +1,528 @@ +"""Beam search module.""" + +from itertools import chain +import logger +from typing import Any +from typing import Dict +from typing import List +from typing import NamedTuple +from typing import Tuple +from typing import Union + +import paddle + +from .utils import end_detect +from .scorers.scorer_interface import PartialScorerInterface +from .scorers.scorer_interface import ScorerInterface + +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +class Hypothesis(NamedTuple): + """Hypothesis data type.""" + + yseq: paddle.Tensor # (T,) + score: Union[float, paddle.Tensor] = 0 + scores: Dict[str, Union[float, paddle.Tensor]] = dict() + states: Dict[str, Any] = dict() + + def asdict(self) -> dict: + """Convert data to JSON-friendly dict.""" + return self._replace( + yseq=self.yseq.tolist(), + score=float(self.score), + scores={k: float(v) for k, v in self.scores.items()}, + )._asdict() + + +class BeamSearch(paddle.nn.Layer): + """Beam search implementation.""" + + def __init__( + self, + scorers: Dict[str, ScorerInterface], + weights: Dict[str, float], + beam_size: int, + vocab_size: int, + sos: int, + eos: int, + token_list: List[str] = None, + pre_beam_ratio: float = 1.5, + pre_beam_score_key: str = None, + ): + """Initialize beam search. 
+ + Args: + scorers (dict[str, ScorerInterface]): Dict of decoder modules + e.g., Decoder, CTCPrefixScorer, LM + The scorer will be ignored if it is `None` + weights (dict[str, float]): Dict of weights for each scorers + The scorer will be ignored if its weight is 0 + beam_size (int): The number of hypotheses kept during search + vocab_size (int): The number of vocabulary + sos (int): Start of sequence id + eos (int): End of sequence id + token_list (list[str]): List of tokens for debug log + pre_beam_score_key (str): key of scores to perform pre-beam search + pre_beam_ratio (float): beam size in the pre-beam search + will be `int(pre_beam_ratio * beam_size)` + + """ + super().__init__() + # set scorers + self.weights = weights + self.scorers = dict() # all = full + partial + self.full_scorers = dict() # full tokens + self.part_scorers = dict() # partial tokens + # this module dict is required for recursive cast + # `self.to(device, dtype)` in `recog.py` + self.nn_dict = paddle.nn.LayerDict() # nn.Layer + for k, v in scorers.items(): + w = weights.get(k, 0) + if w == 0 or v is None: + continue + assert isinstance( + v, ScorerInterface + ), f"{k} ({type(v)}) does not implement ScorerInterface" + self.scorers[k] = v + if isinstance(v, PartialScorerInterface): + self.part_scorers[k] = v + else: + self.full_scorers[k] = v + if isinstance(v, paddle.nn.Layer): + self.nn_dict[k] = v + + # set configurations + self.sos = sos + self.eos = eos + self.token_list = token_list + # pre_beam_size > beam_size + self.pre_beam_size = int(pre_beam_ratio * beam_size) + self.beam_size = beam_size + self.n_vocab = vocab_size + if ( + pre_beam_score_key is not None + and pre_beam_score_key != "full" + and pre_beam_score_key not in self.full_scorers + ): + raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}") + # selected `key` scorer to do pre beam search + self.pre_beam_score_key = pre_beam_score_key + # do_pre_beam when need, valid and has part_scorers + self.do_pre_beam = ( + self.pre_beam_score_key is not None + and self.pre_beam_size < self.n_vocab + and len(self.part_scorers) > 0 + ) + + def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]: + """Get an initial hypothesis data. + + Args: + x (paddle.Tensor): The encoder output feature, (T, D) + + Returns: + Hypothesis: The initial hypothesis. + + """ + init_states = dict() + init_scores = dict() + for k, d in self.scorers.items(): + init_states[k] = d.init_state(x) + init_scores[k] = 0.0 + return [ + Hypothesis( + yseq=paddle.to_tensor([self.sos], place=x.place), + score=0.0, + scores=init_scores, + states=init_states, + ) + ] + + @staticmethod + def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor: + """Append new token to prefix tokens. + + Args: + xs (paddle.Tensor): The prefix token, (T,) + x (int): The new token to append + + Returns: + paddle.Tensor: (T+1,), New tensor contains: xs + [x] with xs.dtype and xs.device + + """ + x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place) + return paddle.cat((xs, x)) + + def score_full( + self, hyp: Hypothesis, x: paddle.Tensor + ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: + """Score new hypothesis by `self.full_scorers`. 
+ + Args: + hyp (Hypothesis): Hypothesis with prefix tokens to score + x (paddle.Tensor): Corresponding input feature, (T, D) + + Returns: + Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of + score dict of `hyp` that has string keys of `self.full_scorers` + and tensor score values of shape: `(self.n_vocab,)`, + and state dict that has string keys + and state values of `self.full_scorers` + + """ + scores = dict() + states = dict() + for k, d in self.full_scorers.items(): + # scores[k] shape (self.n_vocab,) + scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x) + return scores, states + + def score_partial( + self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor + ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: + """Score new hypothesis by `self.part_scorers`. + + Args: + hyp (Hypothesis): Hypothesis with prefix tokens to score + ids (paddle.Tensor): 1D tensor of new partial tokens to score, + len(ids) < n_vocab + x (paddle.Tensor): Corresponding input feature, (T, D) + + Returns: + Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of + score dict of `hyp` that has string keys of `self.part_scorers` + and tensor score values of shape: `(len(ids),)`, + and state dict that has string keys + and state values of `self.part_scorers` + + """ + scores = dict() + states = dict() + for k, d in self.part_scorers.items(): + # scores[k] shape (len(ids),) + scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x) + return scores, states + + def beam( + self, weighted_scores: paddle.Tensor, ids: paddle.Tensor + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Compute topk full token ids and partial token ids. + + Args: + weighted_scores (paddle.Tensor): The weighted sum scores for each tokens. + Its shape is `(self.n_vocab,)`. + ids (paddle.Tensor): The partial token ids(Global) to compute topk. + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: + The topk full token ids and partial token ids. + Their shapes are `(self.beam_size,)`. + i.e. (global ids, global relative local ids). + + """ + # no pre beam performed, `ids` equal to `weighted_scores` + if weighted_scores.size(0) == ids.size(0): + top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab + return top_ids, top_ids + + # mask pruned in pre-beam not to select in topk + tmp = weighted_scores[ids] + weighted_scores[:] = -float("inf") + weighted_scores[ids] = tmp + # top_ids no equal to local_ids, since ids shape not same + top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab + local_ids = weighted_scores[ids].topk(self.beam_size)[1] # index in len(ids) + return top_ids, local_ids + + @staticmethod + def merge_scores( + prev_scores: Dict[str, float], + next_full_scores: Dict[str, paddle.Tensor], + full_idx: int, + next_part_scores: Dict[str, paddle.Tensor], + part_idx: int, + ) -> Dict[str, paddle.Tensor]: + """Merge scores for new hypothesis. + + Args: + prev_scores (Dict[str, float]): + The previous hypothesis scores by `self.scorers` + next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers` + full_idx (int): The next token id for `next_full_scores` + next_part_scores (Dict[str, paddle.Tensor]): + scores of partial tokens by `self.part_scorers` + part_idx (int): The new token id for `next_part_scores` + + Returns: + Dict[str, paddle.Tensor]: The new score dict. + Its keys are names of `self.full_scorers` and `self.part_scorers`. + Its values are scalar tensors by the scorers. 
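
        Note:
            ``full_idx`` indexes the full-vocabulary score vectors from
            ``self.full_scorers``; ``part_idx`` indexes only the pre-pruned
            scores from ``self.part_scorers``.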
+ + """ + new_scores = dict() + for k, v in next_full_scores.items(): + new_scores[k] = prev_scores[k] + v[full_idx] + for k, v in next_part_scores.items(): + new_scores[k] = prev_scores[k] + v[part_idx] + return new_scores + + def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any: + """Merge states for new hypothesis. + + Args: + states: states of `self.full_scorers` + part_states: states of `self.part_scorers` + part_idx (int): The new token id for `part_scores` + + Returns: + Dict[str, paddle.Tensor]: The new score dict. + Its keys are names of `self.full_scorers` and `self.part_scorers`. + Its values are states of the scorers. + + """ + new_states = dict() + for k, v in states.items(): + new_states[k] = v + for k, d in self.part_scorers.items(): + new_states[k] = d.select_state(part_states[k], part_idx) + return new_states + + def search( + self, running_hyps: List[Hypothesis], x: paddle.Tensor + ) -> List[Hypothesis]: + """Search new tokens for running hypotheses and encoded speech x. + + Args: + running_hyps (List[Hypothesis]): Running hypotheses on beam + x (paddle.Tensor): Encoded speech feature (T, D) + + Returns: + List[Hypotheses]: Best sorted hypotheses + + """ + best_hyps = [] + part_ids = paddle.arange(self.n_vocab) # no pre-beam + for hyp in running_hyps: + # scoring + weighted_scores = paddle.zeros(self.n_vocab, dtype=x.dtype) + scores, states = self.score_full(hyp, x) + for k in self.full_scorers: + weighted_scores += self.weights[k] * scores[k] + # partial scoring + if self.do_pre_beam: + pre_beam_scores = ( + weighted_scores + if self.pre_beam_score_key == "full" + else scores[self.pre_beam_score_key] + ) + part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1] + part_scores, part_states = self.score_partial(hyp, part_ids, x) + for k in self.part_scorers: + weighted_scores[part_ids] += self.weights[k] * part_scores[k] + # add previous hyp score + weighted_scores += hyp.score + + # update hyps + for j, part_j in zip(*self.beam(weighted_scores, part_ids)): + # `part_j` is `j` relative id in `part_scores` + # will be (2 x beam at most) + best_hyps.append( + Hypothesis( + score=weighted_scores[j], + yseq=self.append_token(hyp.yseq, j), + scores=self.merge_scores( + hyp.scores, scores, j, part_scores, part_j + ), + states=self.merge_states(states, part_states, part_j), + ) + ) + + # sort and prune 2 x beam -> beam + best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[ + : min(len(best_hyps), self.beam_size) + ] + return best_hyps + + def forward( + self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0 + ) -> List[Hypothesis]: + """Perform beam search. + + Args: + x (paddle.Tensor): Encoded speech feature (T, D) + maxlenratio (float): Input length ratio to obtain max output length. + If maxlenratio=0.0 (default), it uses a end-detect function + to automatically find maximum hypothesis lengths + If maxlenratio<0.0, its absolute value is interpreted + as a constant max output length. + minlenratio (float): Input length ratio to obtain min output length. 
+ + Returns: + list[Hypothesis]: N-best decoding results + + """ + # set length bounds + if maxlenratio == 0: + maxlen = x.shape[0] + elif maxlenratio < 0: + maxlen = -1 * int(maxlenratio) + else: + maxlen = max(1, int(maxlenratio * x.size(0))) + minlen = int(minlenratio * x.size(0)) + logger.info("decoder input length: " + str(x.shape[0])) + logger.info("max output length: " + str(maxlen)) + logger.info("min output length: " + str(minlen)) + + # main loop of prefix search + running_hyps = self.init_hyp(x) + ended_hyps = [] + for i in range(maxlen): + logger.debug("position " + str(i)) + best = self.search(running_hyps, x) + # post process of one iteration + running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps) + # end detection + if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i): + logger.info(f"end detected at {i}") + break + if len(running_hyps) == 0: + logger.info("no hypothesis. Finish decoding.") + break + else: + logger.debug(f"remained hypotheses: {len(running_hyps)}") + + nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True) + # check the number of hypotheses reaching to eos + if len(nbest_hyps) == 0: + logger.warning( + "there is no N-best results, perform recognition " + "again with smaller minlenratio." + ) + return ( + [] + if minlenratio < 0.1 + else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1)) + ) + + # report the best result + best = nbest_hyps[0] + for k, v in best.scores.items(): + logger.info( + f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}" + ) + logger.info(f"total log probability: {best.score:.2f}") + logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}") + logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}") + if self.token_list is not None: + logger.info( + "best hypo: " + + "".join([self.token_list[x] for x in best.yseq[1:-1]]) + + "\n" + ) + return nbest_hyps + + def post_process( + self, + i: int, + maxlen: int, + maxlenratio: float, + running_hyps: List[Hypothesis], + ended_hyps: List[Hypothesis], + ) -> List[Hypothesis]: + """Perform post-processing of beam search iterations. + + Args: + i (int): The length of hypothesis tokens. + maxlen (int): The maximum length of tokens in beam search. + maxlenratio (int): The maximum length ratio in beam search. + running_hyps (List[Hypothesis]): The running hypotheses in beam search. + ended_hyps (List[Hypothesis]): The ended hypotheses in beam search. + + Returns: + List[Hypothesis]: The new running hypotheses. 
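
        Note:
            Hypotheses whose last token is ``eos`` are given each scorer's
            ``final_score`` and moved to ``ended_hyps`` in place; only the
            still-running hypotheses are returned.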
+ + """ + logger.debug(f"the number of running hypotheses: {len(running_hyps)}") + if self.token_list is not None: + logger.debug( + "best hypo: " + + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]]) + ) + # add eos in the final loop to avoid that there are no ended hyps + if i == maxlen - 1: + logger.info("adding in the last position in the loop") + running_hyps = [ + h._replace(yseq=self.append_token(h.yseq, self.eos)) + for h in running_hyps + ] + + # add ended hypotheses to a final list, and removed them from current hypotheses + # (this will be a problem, number of hyps < beam) + remained_hyps = [] + for hyp in running_hyps: + if hyp.yseq[-1] == self.eos: + # e.g., Word LM needs to add final score + for k, d in chain(self.full_scorers.items(), self.part_scorers.items()): + s = d.final_score(hyp.states[k]) + hyp.scores[k] += s + hyp = hyp._replace(score=hyp.score + self.weights[k] * s) + ended_hyps.append(hyp) + else: + remained_hyps.append(hyp) + return remained_hyps + + +def beam_search( + x: paddle.Tensor, + sos: int, + eos: int, + beam_size: int, + vocab_size: int, + scorers: Dict[str, ScorerInterface], + weights: Dict[str, float], + token_list: List[str] = None, + maxlenratio: float = 0.0, + minlenratio: float = 0.0, + pre_beam_ratio: float = 1.5, + pre_beam_score_key: str = "full", +) -> list: + """Perform beam search with scorers. + + Args: + x (paddle.Tensor): Encoded speech feature (T, D) + sos (int): Start of sequence id + eos (int): End of sequence id + beam_size (int): The number of hypotheses kept during search + vocab_size (int): The number of vocabulary + scorers (dict[str, ScorerInterface]): Dict of decoder modules + e.g., Decoder, CTCPrefixScorer, LM + The scorer will be ignored if it is `None` + weights (dict[str, float]): Dict of weights for each scorers + The scorer will be ignored if its weight is 0 + token_list (list[str]): List of tokens for debug log + maxlenratio (float): Input length ratio to obtain max output length. + If maxlenratio=0.0 (default), it uses a end-detect function + to automatically find maximum hypothesis lengths + minlenratio (float): Input length ratio to obtain min output length. + pre_beam_score_key (str): key of scores to perform pre-beam search + pre_beam_ratio (float): beam size in the pre-beam search + will be `int(pre_beam_ratio * beam_size)` + + Returns: + List[Dict]: N-best decoding results + + """ + ret = BeamSearch( + scorers, + weights, + beam_size=beam_size, + vocab_size=vocab_size, + pre_beam_ratio=pre_beam_ratio, + pre_beam_score_key=pre_beam_score_key, + sos=sos, + eos=eos, + token_list=token_list, + ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio) + return [h.asdict() for h in ret] diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py index de29d0546..06cb3c9d4 100644 --- a/deepspeech/io/batchfy.py +++ b/deepspeech/io/batchfy.py @@ -354,7 +354,7 @@ def make_batchset( :param int batch_frames_out: maximum number of output frames in a minibatch. :param int batch_frames_out: maximum number of input+output frames in a minibatch. :param str count: strategy to count maximum size of batch. 
- For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES + For choices, see io.batchfy.BATCH_COUNT_CHOICES :param int max_length_in: maximum length of input to decide adaptive batch size :param int max_length_out: maximum length of output to decide adaptive batch size From ae9b90c344cf620f9d224d31f56f245ee87dbeb5 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 21 Oct 2021 07:44:46 +0000 Subject: [PATCH 05/17] fix ctc scorer name --- deepspeech/decoders/scorers/ctc.py | 4 ++-- deepspeech/decoders/scorers/ctc_prefix_score.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deepspeech/decoders/scorers/ctc.py b/deepspeech/decoders/scorers/ctc.py index 4871d6e12..36b4bfd36 100644 --- a/deepspeech/decoders/scorers/ctc.py +++ b/deepspeech/decoders/scorers/ctc.py @@ -15,8 +15,8 @@ import numpy as np import paddle -from .ctc_prefix_score import CTCPrefixScore -from .ctc_prefix_score import CTCPrefixScorePD +from .ctc_prefix_score import CTCPrefixScorer +from .ctc_prefix_score import CTCPrefixScorerPD from .scorer_interface import BatchPartialScorerInterface diff --git a/deepspeech/decoders/scorers/ctc_prefix_score.py b/deepspeech/decoders/scorers/ctc_prefix_score.py index c85d546d3..5f568c811 100644 --- a/deepspeech/decoders/scorers/ctc_prefix_score.py +++ b/deepspeech/decoders/scorers/ctc_prefix_score.py @@ -6,7 +6,7 @@ import paddle import six -class CTCPrefixScorePD(): +class CTCPrefixScorerPD(): """Batch processing of CTCPrefixScore which is based on Algorithm 2 in WATANABE et al. @@ -267,7 +267,7 @@ class CTCPrefixScorePD(): return (r_prev_new, s_prev, f_min_prev, f_max_prev) -class CTCPrefixScore(): +class CTCPrefixScorer(): """Compute CTC label sequence scores which is based on Algorithm 2 in WATANABE et al. From 0b3cea1df008ecdb7163631dfaebf0f04b5a8646 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 21 Oct 2021 17:28:00 +0800 Subject: [PATCH 06/17] fix img link; rsl format; --- docs/source/asr/deepspeech_architecture.md | 12 +++++++----- docs/source/tts/README.md | 5 +++-- examples/aishell/README.md | 9 +++++---- examples/librispeech/README.md | 7 +++---- examples/librispeech/s0/README.md | 8 -------- examples/other/punctuation_restoration/README.md | 3 ++- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/docs/source/asr/deepspeech_architecture.md b/docs/source/asr/deepspeech_architecture.md index 5a6ca8867..be9471d93 100644 --- a/docs/source/asr/deepspeech_architecture.md +++ b/docs/source/asr/deepspeech_architecture.md @@ -14,10 +14,11 @@ In addition, the training process and the testing process are also introduced. The arcitecture of the model is shown in Fig.1.

[image: Fig.1 The Architecture of the deepspeech2 online model]

+ ### Data Preparation #### Vocabulary For English data, the vocabulary dictionary is composed of 26 English characters with " ' ", space, \ and \. The \ represents the blank label in CTC, the \ represents the unknown character and the \ represents the start and the end characters. For mandarin, the vocabulary dictionary is composed of chinese characters statisticed from the training set and three additional characters are added. The added characters are \, \ and \. For both English and mandarin data, we set the default indexs that \=0, \=1 and \= last index. @@ -130,7 +131,7 @@ By using the command above, the training process can be started. There are 5 sta Using the command below, you can test the deepspeech2 online model. ``` bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml -``` + ``` The detail commands are: ``` conf_path=conf/deepspeech2_online.yaml @@ -152,7 +153,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi - ``` +``` After the training process, we use stage 3,4,5 for testing process. The stage 3 is for testing the model generated in the stage 2 and provided the CER index of the test set. The stage 4 is for transforming the model from dynamic graph to static graph by using "paddle.jit" library. The stage 5 is for testing the model in static graph. @@ -161,12 +162,13 @@ The deepspeech2 offline model is similarity to the deepspeech2 online model. The The arcitecture of the model is shown in Fig.2.

[image: Fig.2 The Architecture of the deepspeech2 offline model]
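
The CTC decoding step described in the decoder section above can be sketched with a toy greedy decoder. The snippet below is a hypothetical NumPy illustration, not the project's C++ implementation: take the best label per frame, merge repeated labels, then drop the blank (index 0, matching the vocabulary layout described above).
```
import itertools

import numpy as np


def ctc_greedy_decode(probs, vocabulary, blank_id=0):
    """Toy CTC greedy decoder: argmax per frame, merge repeats, drop blanks."""
    best_path = np.argmax(probs, axis=1)                      # (T,) labels
    collapsed = [k for k, _ in itertools.groupby(best_path)]  # merge repeats
    return "".join(vocabulary[k] for k in collapsed if k != blank_id)


vocab = ["<blank>", "a", "b", "c"]
probs = np.random.rand(20, len(vocab))  # fake (T, vocab) frame posteriors
print(ctc_greedy_decode(probs, vocab))
```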

+ For data preparation and decoder, the deepspeech2 offline model is same with the deepspeech2 online model. The code of encoder and decoder for deepspeech2 offline model is in: @@ -182,7 +184,7 @@ For training and testing, the "model_type" and the "conf_path" must be set. # Training offline cd examples/aishell/s0 bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml -``` + ``` ``` # Testing offline cd examples/aishell/s0 diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md index 87cac76ea..18283cb27 100644 --- a/docs/source/tts/README.md +++ b/docs/source/tts/README.md @@ -2,10 +2,11 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.
-## News + +## News - Oct-12-2021, Refector examples code. - Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech). - Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech). diff --git a/examples/aishell/README.md b/examples/aishell/README.md index 5e5c5ca90..82ef91da9 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -5,7 +5,8 @@ ## Data -| Data Subset | Duration in Seconds | -| data/manifest.train | 1.23 ~ 14.53125 | -| data/manifest.dev | 1.645 ~ 12.533 | -| data/manifest.test | 1.859125 ~ 14.6999375 | +| Data Subset | Duration in Seconds | +| ------------------- | --------------------- | +| data/manifest.train | 1.23 ~ 14.53125 | +| data/manifest.dev | 1.645 ~ 12.533 | +| data/manifest.test | 1.859125 ~ 14.6999375 | diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 57f506a49..724590952 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,9 +1,8 @@ # ASR -* s0 is for deepspeech2 offline -* s1 is for transformer/conformer/U2 -* s2 is for transformer/conformer/U2 w/ kaldi feat -need install Kaldi +* s0 is for deepspeech2 offline +* s1 is for transformer/conformer/U2 +* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi ## Data | Data Subset | Duration in Seconds | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/s0/README.md index 11bcf5f65..77f92a2b7 100644 --- a/examples/librispeech/s0/README.md +++ b/examples/librispeech/s0/README.md @@ -1,14 +1,6 @@ # LibriSpeech -## Data -| Data Subset | Duration in Seconds | -| --- | --- | -| data/manifest.train | 0.83s ~ 29.735s | -| data/manifest.dev | 1.065 ~ 35.155s | -| data/manifest.test-clean | 1.285s ~ 34.955s | - ## Deepspeech2 - | Model | Params | release | Config | Test set | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | | DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | test-clean | 14.49190807 | 0.067283 | diff --git a/examples/other/punctuation_restoration/README.md b/examples/other/punctuation_restoration/README.md index 42ae0db3a..6393d8f5b 100644 --- a/examples/other/punctuation_restoration/README.md +++ b/examples/other/punctuation_restoration/README.md @@ -1,3 +1,4 @@ # Punctation Restoration -Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask] to do this task. +Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) to do this task. + From 4d5c26f846095ecd2a149cdf8dea2d92a88f151f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 21 Oct 2021 11:30:21 +0000 Subject: [PATCH 07/17] fix wget --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index a06074b37..8146f120a 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -6,7 +6,7 @@ CC ?= gcc # used for sph2pipe # CXX = clang++ # Uncomment these lines... # CC = clang # ...to build with Clang. 
-$(WGET) ?= wget --no-check-certificate +WGET ?= wget --no-check-certificate .PHONY: all clean From 71dacec44321c2072096446f1cf750119aca3ecb Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 21 Oct 2021 11:51:16 +0000 Subject: [PATCH 08/17] fix wget --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 8146f120a..77b41a486 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -85,7 +85,7 @@ sctk-$(SCTK_GITHASH).tar.gz: if [ -d '$(DOWNLOAD_DIR)' ]; then \ cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' .; \ else \ - $($(WGET)) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \ + $(WGET) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \ https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \ fi From 82044aecc52367f9f6cd91323fbcd7c8414d14b6 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 22 Oct 2021 06:28:44 +0000 Subject: [PATCH 09/17] u2 kaldi wer4p0 --- deepspeech/__init__.py | 14 ++++++-- examples/librispeech/s2/README.md | 46 ++++++--------------------- examples/librispeech/s2/local/test.sh | 21 ++++++++---- examples/librispeech/s2/run.sh | 4 ++- 4 files changed, 38 insertions(+), 47 deletions(-) diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index 493f10a6f..f0cd1bae2 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -362,11 +362,19 @@ def ctc_loss(logits, label_lengths, blank=0, reduction='mean', - norm_by_times=True): + norm_by_times=False, + norm_by_batchsize=True, + norm_by_total_logits_len=False): #logger.info("my ctc loss with norm by times") ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403 - loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times, - input_lengths, label_lengths) + loss_out = paddle.fluid.layers.warpctc( + logits, + labels, + blank, + norm_by_times, + input_lengths, + label_lengths, + norm_by_batchsize, ) loss_out = paddle.fluid.layers.squeeze(loss_out, [-1]) assert reduction in ['mean', 'sum', 'none'] diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md index e4022f014..1f7c69194 100644 --- a/examples/librispeech/s2/README.md +++ b/examples/librispeech/s2/README.md @@ -1,41 +1,13 @@ # LibriSpeech -## Data -| Data Subset | Duration in Seconds | -| data/manifest.train | 0.83s ~ 29.735s | -| data/manifest.dev | 1.065 ~ 35.155s | -| data/manifest.test-clean | 1.285s ~ 34.955s | +| Model | Params | Config | Augmentation| Loss | +| --- | --- | --- | --- | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | 6.3197922706604 | -## Conformer -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | - | - | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | | | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | | | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | | | -### Test w/o length filter -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | | | - - -## Chunk Conformer - -| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left 
Chunks | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | | | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | | | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | | - | -| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | | - | - - -## Transformer -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | | | - -### Test w/o length filter -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | | | +| Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| test-clean | attention | 2620 | 52576 | 96.4 | 2.5 | 1.1 | 0.4 | 4.0 | 34.7 | +| test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 | +| test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 | +| test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 | diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh index 5eeb2d612..379a3787e 100755 --- a/examples/librispeech/s2/local/test.sh +++ b/examples/librispeech/s2/local/test.sh @@ -6,7 +6,7 @@ expdir=exp datadir=data nj=32 -lmtag= +lmtag='nolm' recog_set="test-clean test-other dev-clean dev-other" recog_set="test-clean" @@ -29,11 +29,18 @@ config_path=$1 dict=$2 ckpt_prefix=$3 + +ckpt_dir=$(dirname `dirname ${ckpt_prefix}`) +echo "ckpt dir: ${ckpt_dir}" + +ckpt_tag=$(basename ${ckpt_prefix}) +echo "ckpt tag: ${ckpt_tag}" + chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then chunk_mode=true fi -echo "chunk mode ${chunk_mode}" +echo "chunk mode: ${chunk_mode}" # download language model @@ -46,11 +53,13 @@ pids=() # initialize pids for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do ( + echo "decode method: ${dmethd}" for rtask in ${recog_set}; do ( - decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag} + echo "dataset: ${rtask}" + decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag} feat_recog_dir=${datadir} - mkdir -p ${expdir}/${decode_dir} + mkdir -p ${decode_dir} mkdir -p ${feat_recog_dir} # split data @@ -61,7 +70,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco # set batchsize 0 to disable batch decoding batch_size=1 - ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \ + ${decode_cmd} JOB=1:${nj} ${decode_dir}/log/decode.JOB.log \ python3 -u ${BIN_DIR}/test.py \ --model-name u2_kaldi \ --run-mode test \ @@ -69,7 +78,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --dict-path ${dict} \ --config ${config_path} \ --checkpoint_path ${ckpt_prefix} \ - --result-file ${expdir}/${decode_dir}/data.JOB.json \ + --result-file ${decode_dir}/data.JOB.json \ --opts decoding.decoding_method ${dmethd} 
\ --opts decoding.batch_size ${batch_size} \ --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index 46b6ac1b4..1ffe3e5c5 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -1,4 +1,5 @@ #!/bin/bash + set -e . ./path.sh || exit 1; @@ -7,8 +8,9 @@ set -e stage=0 stop_stage=100 conf_path=conf/transformer.yaml -dict_path=data/train_960_unigram5000_units.txt +dict_path=data/bpe_unigram_5000_units.txt avg_num=10 + source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} From 8d176c3f351aa07dea83578ae9431018e04cd0d4 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 22 Oct 2021 06:26:17 +0000 Subject: [PATCH 10/17] add hub for s1 in aishell and librispeech --- deepspeech/exps/deepspeech2/bin/test_hub.py | 8 +- deepspeech/exps/u2/bin/test_hub.py | 187 ++++++++++++++++++++ examples/aishell/s1/local/test_hub.sh | 47 +++++ examples/aishell/s1/run.sh | 7 + examples/librispeech/s0/local/test_hub.sh | 36 ++++ examples/librispeech/s0/run.sh | 6 + examples/librispeech/s1/local/test_hub.sh | 54 ++++++ examples/librispeech/s1/run.sh | 7 + 8 files changed, 347 insertions(+), 5 deletions(-) create mode 100644 deepspeech/exps/u2/bin/test_hub.py create mode 100755 examples/aishell/s1/local/test_hub.sh create mode 100755 examples/librispeech/s0/local/test_hub.sh create mode 100755 examples/librispeech/s1/local/test_hub.sh diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py index 7d98e0b15..38760c285 100644 --- a/deepspeech/exps/deepspeech2/bin/test_hub.py +++ b/deepspeech/exps/deepspeech2/bin/test_hub.py @@ -58,8 +58,7 @@ class DeepSpeech2Tester_hub(): num_processes=cfg.num_proc_bsearch) #replace the '' with ' ' result_transcripts = [ - self._text_featurizer.detokenize(sentence) - for sentence in result_transcripts + sentence.replace("", " ") for sentence in result_transcripts ] return result_transcripts @@ -167,9 +166,8 @@ def check(audio_file): def main_sp(config, args): exp = DeepSpeech2Tester_hub(config, args) - with exp.eval(): - exp.setup() - exp.run_test() + exp.setup() + exp.run_test() def main(config, args): diff --git a/deepspeech/exps/u2/bin/test_hub.py b/deepspeech/exps/u2/bin/test_hub.py new file mode 100644 index 000000000..8c006ebea --- /dev/null +++ b/deepspeech/exps/u2/bin/test_hub.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation for U2 model.""" +import cProfile +import os +import sys + +import paddle +import soundfile + +from deepspeech.exps.u2.config import get_cfg_defaults +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer +from deepspeech.io.collator import SpeechCollator +from deepspeech.models.u2 import U2Model +from deepspeech.training.cli import default_argument_parser +from deepspeech.training.trainer import Trainer +from deepspeech.utils import layer_tools +from deepspeech.utils import mp_tools +from deepspeech.utils.log import Log +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.utility import UpdateConfig +logger = Log(__name__).getlog() + +# TODO(hui zhang): dynamic load + + +class U2Tester_Hub(Trainer): + def __init__(self, config, args): + # super().__init__(config, args) + self.args = args + self.config = config + self.audio_file = args.audio_file + self.collate_fn_test = SpeechCollator.from_config(config) + self._text_featurizer = TextFeaturizer( + unit_type=config.collator.unit_type, + vocab_filepath=None, + spm_model_prefix=config.collator.spm_model_prefix) + + def setup_model(self): + config = self.config + model_conf = config.model + + with UpdateConfig(model_conf): + model_conf.input_dim = self.collate_fn_test.feature_size + model_conf.output_dim = self.collate_fn_test.vocab_size + + model = U2Model.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + + self.model = model + logger.info("Setup model") + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + self.model.eval() + cfg = self.config.decoding + audio_file = self.audio_file + collate_fn_test = self.collate_fn_test + audio, _ = collate_fn_test.process_utterance( + audio_file=audio_file, transcript="Hello") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + vocab_list = collate_fn_test.vocab_list + + text_feature = self.collate_fn_test.text_feature + result_transcripts = self.model.decode( + audio, + audio_len, + text_feature=text_feature, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch, + ctc_weight=cfg.ctc_weight, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + logger.info("The result_transcripts: " + result_transcripts[0][0]) + + def run_test(self): + self.resume() + try: + self.test() + except KeyboardInterrupt: + sys.exit(-1) + + def setup(self): + """Setup the experiment. + """ + paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + + #self.setup_output_dir() + #self.setup_checkpointer() + + #self.setup_dataloader() + self.setup_model() + + self.iteration = 0 + self.epoch = 0 + + def resume(self): + """Resume from the checkpoint at checkpoints in the output + directory or load a specified checkpoint. 
+ """ + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + +def check(audio_file): + logger.info("checking the audio file format......") + try: + sig, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main_sp(config, args): + exp = U2Tester_Hub(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + parser.add_argument( + "--audio_file", type=str, help="path of the input audio file") + args = parser.parse_args() + print_arguments(args, globals()) + + if not os.path.isfile(args.audio_file): + print("Please input the right audio file path") + sys.exit(-1) + check(args.audio_file) + # https://yaml.org/type/float.html + config = get_cfg_defaults() + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/s1/local/test_hub.sh new file mode 100755 index 000000000..99b141c81 --- /dev/null +++ b/examples/aishell/s1/local/test_hub.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: ${0} config_path ckpt_path_prefix audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 +audio_file=$3 + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? -ne 0 ]; then +# exit 1 +#fi + + + +for type in attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test_hub.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} \ + --audio_file ${audio_file} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done +exit 0 diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index 054fdd462..126c8e4e2 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -13,6 +13,8 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') echo "checkpoint name ${ckpt}" +audio_file="data/tmp.wav" + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 @@ -48,3 +50,8 @@ fi # train lm and build TLG ./local/tlg.sh --corpus aishell --lmtype srilm fi + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 +fi diff --git a/examples/librispeech/s0/local/test_hub.sh b/examples/librispeech/s0/local/test_hub.sh new file mode 100755 index 000000000..2e32f24a0 --- /dev/null +++ b/examples/librispeech/s0/local/test_hub.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ $# != 4 ];then + echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 +model_type=$3 +audio_file=$4 + +# download language model +bash local/download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/test_hub.py \ +--nproc ${ngpu} \ +--config ${config_path} \ +--result_file ${ckpt_prefix}.rsl \ +--checkpoint_path ${ckpt_prefix} \ +--model_type ${model_type} \ +--audio_file ${audio_file} + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/s0/run.sh index af47fb9b8..385a24a44 100755 --- a/examples/librispeech/s0/run.sh +++ b/examples/librispeech/s0/run.sh @@ -13,6 +13,7 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" +audio_file="data/tmp.flac" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 @@ -37,3 +38,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 +fi diff --git a/examples/librispeech/s1/local/test_hub.sh b/examples/librispeech/s1/local/test_hub.sh new file mode 100755 index 000000000..dcf242e90 --- /dev/null +++ b/examples/librispeech/s1/local/test_hub.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: ${0} config_path ckpt_path_prefix audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 +audio_file=$3 + +# bpemode (unigram or bpe) +nbpe=5000 +bpemode=unigram +bpeprefix="data/bpe_${bpemode}_${nbpe}" +bpemodel=${bpeprefix}.model + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? 
-ne 0 ]; then +# exit 1 +#fi + + +for type in attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test_hub.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} \ + --audio_file ${audio_file} + + #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +exit 0 diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 4c8bf514e..9ea934301 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -15,6 +15,8 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" +audio_file="data/tmp.flac" + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 @@ -44,3 +46,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi + +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 +fi From 08bca964f89e90b0acc2ac8bb2cf541d02dd8e3d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 22 Oct 2021 07:29:24 +0000 Subject: [PATCH 11/17] releace the with ' ' in ctcdecoder --- deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp | 3 ++- deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp | 3 ++- deepspeech/decoders/ctcdecoder/swig/decoder_utils.h | 1 + deepspeech/exps/deepspeech2/bin/test_hub.py | 4 ---- deepspeech/exps/deepspeech2/model.py | 5 ----- 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp b/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp index 18008cced..53a04fba0 100644 --- a/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp +++ b/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp @@ -53,7 +53,8 @@ std::string ctc_greedy_decoder( std::string best_path_result; for (size_t i = 0; i < idx_vec.size(); ++i) { if (idx_vec[i] != blank_id) { - best_path_result += vocabulary[idx_vec[i]]; + std::string ch = vocabulary[idx_vec[i]]; + best_path_result += (ch == kSPACE) ? tSPACE : ch; } } return best_path_result; diff --git a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp index 43f62e753..5d69ad032 100644 --- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp +++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp @@ -74,7 +74,8 @@ std::vector> get_beam_search_result( // convert index to string std::string output_str; for (size_t j = 0; j < output.size(); j++) { - output_str += vocabulary[output[j]]; + std::string ch = vocabulary[output[j]]; + output_str += (ch == kSPACE) ? 
diff --git a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
index 43f62e753..5d69ad032 100644
--- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
@@ -74,7 +74,8 @@ std::vector<std::pair<double, std::string>> get_beam_search_result(
         // convert index to string
         std::string output_str;
         for (size_t j = 0; j < output.size(); j++) {
-            output_str += vocabulary[output[j]];
+            std::string ch = vocabulary[output[j]];
+            output_str += (ch == kSPACE) ? tSPACE : ch;
         }
         std::pair<double, std::string> output_pair(
             -space_prefixes[i]->approx_ctc, output_str);
diff --git a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
index 96399c778..1d75d03db 100644
--- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
+++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
@@ -21,6 +21,7 @@
 #include "path_trie.h"

 const std::string kSPACE = "<space>";
+const std::string tSPACE = " ";

 const float NUM_FLT_INF = std::numeric_limits<float>::max();
 const float NUM_FLT_MIN = std::numeric_limits<float>::min();
diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py
index 38760c285..1203991de 100644
--- a/deepspeech/exps/deepspeech2/bin/test_hub.py
+++ b/deepspeech/exps/deepspeech2/bin/test_hub.py
@@ -56,10 +56,6 @@ class DeepSpeech2Tester_hub():
             cutoff_prob=cfg.cutoff_prob,
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)
-        #replace the '<space>' with ' '
-        result_transcripts = [
-            sentence.replace("<space>", " ") for sentence in result_transcripts
-        ]

         return result_transcripts

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 92320dac8..7b929f8b7 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -341,11 +341,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_prob=cfg.cutoff_prob,
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)
-        #replace the <space> with ' '
-        result_transcripts = [
-            self._text_featurizer.detokenize(sentence)
-            for sentence in result_transcripts
-        ]
         self.autolog.times.stamp()
         self.autolog.times.stamp()

From 1d81d577b1f48ca294338a9591d9dc71e6c7724e Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 22 Oct 2021 08:34:59 +0000
Subject: [PATCH 12/17] add recog interface
---
 deepspeech/decoders/recog.py       | 154 +++++++++++++++++++++++++++++
 deepspeech/decoders/utils.py       |  75 ++++++++++++++
 deepspeech/models/asr_interface.py | 148 +++++++++++++++++++++++++++
 deepspeech/models/u2/u2.py         |  28 +++++-
 deepspeech/modules/decoder.py      |  69 ++++++++++++-
 deepspeech/modules/mask.py         |  16 ++-
 6 files changed, 484 insertions(+), 6 deletions(-)
 create mode 100644 deepspeech/decoders/recog.py
 create mode 100644 deepspeech/models/asr_interface.py

diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py
new file mode 100644
index 000000000..399c5c54f
--- /dev/null
+++ b/deepspeech/decoders/recog.py
@@ -0,0 +1,154 @@
+"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`."""
+
+import json
+import paddle
+
+# from espnet.asr.asr_utils import get_model_conf
+# from espnet.asr.asr_utils import torch_load
+# from espnet.asr.pytorch_backend.asr import load_trained_model
+# from espnet.nets.lm_interface import dynamic_import_lm
+
+# from espnet.nets.asr_interface import ASRInterface
+
+from .utils import add_results_to_json
+# from .batch_beam_search import BatchBeamSearch
+from .beam_search import BeamSearch
+from .scorer_interface import BatchScorerInterface
+from .scorers.length_bonus import LengthBonus
+
+from deepspeech.io.reader import LoadInputsAndTargets
+from deepspeech.utils.log import Log
+logger = Log(__name__).getlog()
+
+
+def recog_v2(args):
+    """Decode with custom models that implement ScorerInterface.
+
+    Args:
+        args (namespace): The program arguments.
+        See py:func:`bin.asr_recog.get_parser` for details
+
+    """
+    logger.warning("experimental API for custom LMs is selected by --api v2")
+    if args.batchsize > 1:
+        raise NotImplementedError("multi-utt batch decoding is not implemented")
+    if args.streaming_mode is not None:
+        raise NotImplementedError("streaming mode is not implemented")
+    if args.word_rnnlm:
+        raise NotImplementedError("word LM is not implemented")
+
+    # set_deterministic(args)
+    model, train_args = load_trained_model(args.model)
+    # assert isinstance(model, ASRInterface)
+    model.eval()
+    load_inputs_and_targets = LoadInputsAndTargets(
+        mode="asr",
+        load_output=False,
+        sort_in_input_length=False,
+        preprocess_conf=train_args.preprocess_conf
+        if args.preprocess_conf is None
+        else args.preprocess_conf,
+        preprocess_args={"train": False},
+    )
+
+    if args.rnnlm:
+        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
+        # NOTE: for compatibility with models older than version 0.5.0
+        lm_model_module = getattr(lm_args, "model_module", "default")
+        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
+        lm = lm_class(len(train_args.char_list), lm_args)
+        torch_load(args.rnnlm, lm)
+        lm.eval()
+    else:
+        lm = None
+
+    if args.ngram_model:
+        from .scorers.ngram import NgramFullScorer
+        from .scorers.ngram import NgramPartScorer
+
+        if args.ngram_scorer == "full":
+            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
+        else:
+            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
+    else:
+        ngram = None
+
+    scorers = model.scorers()
+    scorers["lm"] = lm
+    scorers["ngram"] = ngram
+    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
+    weights = dict(
+        decoder=1.0 - args.ctc_weight,
+        ctc=args.ctc_weight,
+        lm=args.lm_weight,
+        ngram=args.ngram_weight,
+        length_bonus=args.penalty,
+    )
+    beam_search = BeamSearch(
+        beam_size=args.beam_size,
+        vocab_size=len(train_args.char_list),
+        weights=weights,
+        scorers=scorers,
+        sos=model.sos,
+        eos=model.eos,
+        token_list=train_args.char_list,
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
+    )
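The weights dict above is the joint-decoding recipe: BeamSearch ranks each partial hypothesis by a weighted sum of the scorer log-probabilities. A toy illustration of that fusion, with made-up numbers:

    # Score fusion as BeamSearch applies it (all values illustrative).
    weights = {"decoder": 0.7, "ctc": 0.3, "lm": 0.1, "length_bonus": 0.0}
    scores = {"decoder": -2.1, "ctc": -3.4, "lm": -5.0, "length_bonus": 1.0}
    total = sum(weights[k] * scores[k] for k in weights)
    print(round(total, 3))  # -2.99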
+    # TODO(karita): make all scorers batchfied
+    if args.batchsize == 1:
+        non_batch = [
+            k
+            for k, v in beam_search.full_scorers.items()
+            if not isinstance(v, BatchScorerInterface)
+        ]
+        if len(non_batch) == 0:
+            beam_search.__class__ = BatchBeamSearch
+            logger.info("BatchBeamSearch implementation is selected.")
+        else:
+            logger.warning(
+                f"As non-batch scorers {non_batch} are found, "
+                f"falling back to the non-batch implementation."
+            )
+
+    if args.ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+    if args.ngpu == 1:
+        device = "gpu:0"
+    else:
+        device = "cpu"
+    dtype = getattr(paddle, args.dtype)
+    logger.info(f"Decoding device={device}, dtype={dtype}")
+    model.to(device=device, dtype=dtype)
+    model.eval()
+    beam_search.to(device=device, dtype=dtype)
+    beam_search.eval()
+
+    # read json data
+    with open(args.recog_json, "rb") as f:
+        js = json.load(f)
+    # jsonlines to dict, keyed by 'utt'
+    js = {item['utt']: item for item in js}
+
+    new_js = {}
+    with paddle.no_grad():
+        for idx, name in enumerate(js.keys(), 1):
+            logger.info("(%d/%d) decoding " + name, idx, len(js.keys()))
+            batch = [(name, js[name])]
+            feat = load_inputs_and_targets(batch)[0][0]
+            enc = model.encode(paddle.to_tensor(feat).to(device=device, dtype=dtype))
+            nbest_hyps = beam_search(
+                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
+            )
+            nbest_hyps = [
+                h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
+            ]
+            new_js[name] = add_results_to_json(
+                js[name], nbest_hyps, train_args.char_list
+            )
+
+    with open(args.result_label, "wb") as f:
+        f.write(
+            json.dumps(
+                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
+            ).encode("utf_8")
+        )
diff --git a/deepspeech/decoders/utils.py b/deepspeech/decoders/utils.py
index 92f65814d..0281a78bb 100644
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
@@ -47,3 +47,78 @@ def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
             return True
     else:
         return False
+
+
+# * ------------------ recognition related ------------------ *
+def parse_hypothesis(hyp, char_list):
+    """Parse hypothesis.
+
+    Args:
+        hyp (dict[str, Any]): Recognition hypothesis.
+        char_list (list[str]): List of characters.
+
+    Returns:
+        tuple(str, str, str, float)
+
+    """
+    # remove sos and get results
+    tokenid_as_list = list(map(int, hyp["yseq"][1:]))
+    token_as_list = [char_list[idx] for idx in tokenid_as_list]
+    score = float(hyp["score"])
+
+    # convert to string
+    tokenid = " ".join([str(idx) for idx in tokenid_as_list])
+    token = " ".join(token_as_list)
+    text = "".join(token_as_list).replace("<space>", " ")
+
+    return text, token, tokenid, score
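A self-contained toy check of the parse_hypothesis contract above, using a hypothetical five-token vocabulary:

    # parse_hypothesis drops yseq[0] (the <sos> id) and renders three views.
    char_list = ["<blank>", "a", "b", "<space>", "<eos>"]
    hyp = {"yseq": [4, 1, 3, 2], "score": -1.5}   # yseq[0] is the <sos>/<eos> id

    ids = list(map(int, hyp["yseq"][1:]))
    tokens = [char_list[i] for i in ids]
    text = "".join(tokens).replace("<space>", " ")  # detokenized text
    token = " ".join(tokens)                        # space-joined tokens
    tokenid = " ".join(str(i) for i in ids)         # space-joined ids
    assert (text, token, tokenid) == ("a b", "a <space> b", "1 3 2")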
+
+
+def add_results_to_json(js, nbest_hyps, char_list):
+    """Add N-best results to json.
+
+    Args:
+        js (dict[str, Any]): Groundtruth utterance dict.
+        nbest_hyps (list[dict[str, Any]]): List of recognition hypotheses.
+        char_list (list[str]): List of characters.
+
+    Returns:
+        dict[str, Any]: N-best results added utterance dict.
+
+    """
+    # copy old json info
+    new_js = dict()
+    new_js["utt2spk"] = js["utt2spk"]
+    new_js["output"] = []
+
+    for n, hyp in enumerate(nbest_hyps, 1):
+        # parse hypothesis
+        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
+
+        # copy ground-truth
+        if len(js["output"]) > 0:
+            out_dic = dict(js["output"][0].items())
+        else:
+            # for no reference case (e.g., speech translation)
+            out_dic = {"name": ""}
+
+        # update name
+        out_dic["name"] += "[%d]" % n
+
+        # add recognition results
+        out_dic["rec_text"] = rec_text
+        out_dic["rec_token"] = rec_token
+        out_dic["rec_tokenid"] = rec_tokenid
+        out_dic["score"] = score
+
+        # add to list of N-best result dicts
+        new_js["output"].append(out_dic)
+
+        # show 1-best result
+        if n == 1:
+            if "text" in out_dic.keys():
+                logging.info("groundtruth: %s" % out_dic["text"])
+                logging.info("prediction : %s" % out_dic["rec_text"])
+
+    return new_js
\ No newline at end of file
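After add_results_to_json, each utterance record has roughly this shape (values illustrative, continuing the toy vocabulary above; note the 1-based rank appended to the output name):

    # Sketch of new_js for one utterance (hypothetical values).
    new_js = {
        "utt2spk": "spk1",
        "output": [{
            "name": "target1[1]",        # ground-truth name + "[rank]"
            "text": "a b",               # copied reference, when present
            "rec_text": "a b",           # detokenized 1-best text
            "rec_token": "a <space> b",  # space-joined tokens
            "rec_tokenid": "1 3 2",      # space-joined token ids
            "score": -1.5,
        }],
    }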
diff --git a/deepspeech/models/asr_interface.py b/deepspeech/models/asr_interface.py
new file mode 100644
index 000000000..eb820fc05
--- /dev/null
+++ b/deepspeech/models/asr_interface.py
@@ -0,0 +1,148 @@
+"""ASR Interface module."""
+import argparse
+
+from deepspeech.utils.dynamic_import import dynamic_import
+
+
+class ASRInterface:
+    """ASR Interface for ESPnet model implementation."""
+
+    @staticmethod
+    def add_arguments(parser):
+        """Add arguments to parser."""
+        return parser
+
+    @classmethod
+    def build(cls, idim: int, odim: int, **kwargs):
+        """Initialize this class with python-level args.
+
+        Args:
+            idim (int): The number of an input feature dim.
+            odim (int): The number of output vocab.
+
+        Returns:
+            ASRInterface: A new instance of ASRInterface.
+
+        """
+        args = argparse.Namespace(**kwargs)
+        return cls(idim, odim, args)
+
+    def forward(self, xs, ilens, ys, olens):
+        """Compute loss for training.
+
+        :param xs: batch of padded source sequences paddle.Tensor (B, Tmax, idim)
+        :param ilens: batch of lengths of source sequences (B), paddle.Tensor
+        :param ys: batch of padded target sequences paddle.Tensor (B, Lmax)
+        :param olens: batch of lengths of target sequences (B), paddle.Tensor
+        :return: loss value
+        :rtype: paddle.Tensor
+        """
+        raise NotImplementedError("forward method is not implemented")
+
+    def recognize(self, x, recog_args, char_list=None, rnnlm=None):
+        """Recognize x for evaluation.
+
+        :param ndarray x: input acoustic feature (B, T, D) or (T, D)
+        :param namespace recog_args: argument namespace containing options
+        :param list char_list: list of characters
+        :param paddle.nn.Layer rnnlm: language model module
+        :return: N-best decoding results
+        :rtype: list
+        """
+        raise NotImplementedError("recognize method is not implemented")
+
+    def recognize_batch(self, x, recog_args, char_list=None, rnnlm=None):
+        """Beam search implementation for batch.
+
+        :param paddle.Tensor x: encoder hidden state sequences (B, Tmax, Henc)
+        :param namespace recog_args: argument namespace containing options
+        :param list char_list: list of characters
+        :param paddle.nn.Module rnnlm: language model module
+        :return: N-best decoding results
+        :rtype: list
+        """
+        raise NotImplementedError("Batch decoding is not supported yet.")
+
+    def calculate_all_attentions(self, xs, ilens, ys):
+        """Calculate attention.
+
+        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
+        :param ndarray ilens: batch of lengths of input sequences (B)
+        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
+        :return: attention weights (B, Lmax, Tmax)
+        :rtype: float ndarray
+        """
+        raise NotImplementedError("calculate_all_attentions method is not implemented")
+
+    def calculate_all_ctc_probs(self, xs, ilens, ys):
+        """Calculate CTC probability.
+
+        :param list xs: list of padded input sequences [(T1, idim), (T2, idim), ...]
+        :param ndarray ilens: batch of lengths of input sequences (B)
+        :param list ys: list of character id sequence tensor [(L1), (L2), (L3), ...]
+        :return: CTC probabilities (B, Tmax, vocab)
+        :rtype: float ndarray
+        """
+        raise NotImplementedError("calculate_all_ctc_probs method is not implemented")
+
+    @property
+    def attention_plot_class(self):
+        """Get attention plot class."""
+        from espnet.asr.asr_utils import PlotAttentionReport
+
+        return PlotAttentionReport
+
+    @property
+    def ctc_plot_class(self):
+        """Get CTC plot class."""
+        from espnet.asr.asr_utils import PlotCTCReport
+
+        return PlotCTCReport
+
+    def get_total_subsampling_factor(self):
+        """Get total subsampling factor."""
+        raise NotImplementedError(
+            "get_total_subsampling_factor method is not implemented"
+        )
+
+    def encode(self, feat):
+        """Encode feature in `beam_search` (optional).
+
+        Args:
+            x (numpy.ndarray): input feature (T, D)
+        Returns:
+            paddle.Tensor: encoded feature (T, D)
+        """
+        raise NotImplementedError("encode method is not implemented")
+
+    def scorers(self):
+        """Get scorers for `beam_search` (optional).
+
+        Returns:
+            dict[str, ScorerInterface]: dict of `ScorerInterface` objects
+
+        """
+        raise NotImplementedError("scorers method is not implemented")
+
+
+predefined_asr = {
+    "transformer": "deepspeech.models.u2:E2E",
+    "conformer": "deepspeech.models.u2:E2E",
+}
+
+def dynamic_import_asr(module, name):
+    """Import ASR models dynamically.
+
+    Args:
+        module (str): module_name:class_name or alias in `predefined_asr`
+        name (str): asr name. e.g., transformer, conformer
+
+    Returns:
+        type: ASR class
+
+    """
+    model_class = dynamic_import(module, predefined_asr.get(name, ""))
+    assert issubclass(
+        model_class, ASRInterface
+    ), f"{module} does not implement ASRInterface"
+    return model_class
diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py
index fd63fa9c5..8915cbd7d 100644
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@@ -49,13 +49,15 @@ from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
 from deepspeech.utils.utility import UpdateConfig
+from deepspeech.models.asr_interface import ASRInterface
+from deepspeech.decoders.scorers.ctc_prefix_score import CTCPrefixScorer

 __all__ = ["U2Model", "U2InferModel"]

 logger = Log(__name__).getlog()


-class U2BaseModel(nn.Layer):
+class U2BaseModel(ASRInterface, nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""

     @classmethod
@@ -120,7 +122,7 @@ class U2BaseModel(nn.Layer):
                  **kwargs):
         assert 0.0 <= ctc_weight <= 1.0, ctc_weight

-        super().__init__()
+        nn.Layer.__init__(self)
         # note that eos is the same as sos (equivalent ID)
         self.sos = vocab_size - 1
         self.eos = vocab_size - 1
@@ -813,7 +815,27 @@ class U2BaseModel(nn.Layer):
         return res, res_tokenids


-class U2Model(U2BaseModel):
+class U2DecodeModel(U2BaseModel):
+
+    def scorers(self):
+        """Scorers."""
+        return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos))
+
+    def encode(self, x):
+        """Encode acoustic features.
+
+        :param ndarray x: source acoustic feature (T, D)
+        :return: encoder outputs
+        :rtype: paddle.Tensor
+        """
+        self.eval()
+        x = paddle.to_tensor(x).unsqueeze(0)
+        ilen = x.size(1)
+        enc_output, _ = self._forward_encoder(x, ilen)
+        return enc_output.squeeze(0)
+
+
+class U2Model(U2DecodeModel):
     def __init__(self, configs: dict):
         vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py
index 1ae3ce371..154b7390f 100644
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@@ -15,6 +15,7 @@
 from typing import List
 from typing import Optional
 from typing import Tuple
+from typing import Any

 import paddle
 from paddle import nn
@@ -25,7 +26,9 @@ from deepspeech.modules.decoder_layer import DecoderLayer
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.modules.mask import subsequent_mask
+from deepspeech.modules.mask import make_xs_mask
 from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
+from deepspeech.decoders.scorers.score_interface import BatchScorerInterface
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -33,7 +36,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]


-class TransformerDecoder(nn.Layer):
+class TransformerDecoder(BatchScorerInterface, nn.Layer):
     """Base class of Transfomer decoder module.
     Args:
         vocab_size: output dim
@@ -71,7 +74,8 @@ class TransformerDecoder(nn.Layer):
             concat_after: bool=False, ):

         assert check_argument_types()
-        super().__init__()
+        nn.Layer.__init__(self)
+        self.selfattention_layer_type = 'selfattn'
         attention_dim = encoder_output_size

         if input_layer == "embed":
@@ -180,3 +184,64 @@ class TransformerDecoder(nn.Layer):
         if self.use_output_layer:
             y = paddle.log_softmax(self.output_layer(y), axis=-1)
         return y, new_cache
+
+    # beam search API (see ScorerInterface)
+    def score(self, ys, state, x):
+        """Score.
+        ys: (ylen,)
+        x: (xlen, n_feat)
+        """
+        ys_mask = subsequent_mask(len(ys)).unsqueeze(0)
+        x_mask = make_xs_mask(x.unsqueeze(0))
+        if self.selfattention_layer_type != "selfattn":
+            # TODO(karita): implement cache
+            logger.warning(
+                f"{self.selfattention_layer_type} does not support cached decoding."
+            )
+            state = None
+        logp, state = self.forward_one_step(
+            x.unsqueeze(0), x_mask,
+            ys.unsqueeze(0), ys_mask,
+            cache=state
+        )
+        return logp.squeeze(0), state
+
+    # batch beam search API (see BatchScorerInterface)
+    def batch_score(
+            self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor
+    ) -> Tuple[paddle.Tensor, List[Any]]:
+        """Score new token batch (required).
+
+        Args:
+            ys (paddle.Tensor): paddle.int64 prefix tokens (n_batch, ylen).
+            states (List[Any]): Scorer states for prefix tokens.
+            xs (paddle.Tensor):
+                The encoder feature that generates ys (n_batch, xlen, n_feat).
+
+        Returns:
+            tuple[paddle.Tensor, List[Any]]: Tuple of
+                batchfied scores for next token with shape of `(n_batch, n_vocab)`
+                and next state list for ys.
+
+        """
+        # merge states
+        n_batch = len(ys)
+        n_layers = len(self.decoders)
+        if states[0] is None:
+            batch_state = None
+        else:
+            # transpose state of [batch, layer] into [layer, batch]
+            batch_state = [
+                paddle.stack([states[b][i] for b in range(n_batch)])
+                for i in range(n_layers)
+            ]
+
+        # batch decoding
+        ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0)
+
+        xs_mask = make_xs_mask(xs)
+        logp, states = self.forward_one_step(xs, xs_mask, ys, ys_mask, cache=batch_state)
+
+        # transpose state of [layer, batch] into [batch, layer]
+        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
+        return logp, state_list
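The two transposes in batch_score carry all of the cache bookkeeping. Stripped of the paddle.stack call, the same reshuffle on plain Python lists (a runnable sketch with placeholder state strings) is:

    # Per-hypothesis states live as [batch][layer]; the decoder wants them
    # grouped per layer as [layer][batch], and transposed back afterwards.
    n_batch, n_layers = 3, 2
    states = [[f"s{b}{i}" for i in range(n_layers)] for b in range(n_batch)]
    batch_state = [[states[b][i] for b in range(n_batch)] for i in range(n_layers)]
    state_list = [[batch_state[i][b] for i in range(n_layers)] for b in range(n_batch)]
    assert state_list == states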
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index 00f228a2b..cffa10a7b 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -18,12 +18,24 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_xs_mask", "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
     "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
     "mask_finished_preds"
 ]


+def make_xs_mask(xs: paddle.Tensor) -> paddle.Tensor:
+    """Make mask tensor containing indices of the non-padded part.
+    Args:
+        xs (paddle.Tensor): (B, T, D), zeros for pad.
+    Returns:
+        paddle.Tensor: Mask tensor of the non-padded part. (B, T, D)
+    """
+    pad_frame = paddle.zeros([1, 1, xs.shape[-1]], dtype=xs.dtype)
+    mask = xs != pad_frame
+    return mask
+
+
 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.
     See description of make_non_pad_mask.
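A small usage sketch for the new helper (run from inside this repo so the import resolves); note that the comparison is elementwise against an all-zero frame, so any zero entry is marked False:

    import paddle
    from deepspeech.modules.mask import make_xs_mask

    xs = paddle.to_tensor([[[1.0, 2.0], [0.0, 0.0]]])  # (B=1, T=2, D=2); 2nd frame is pad
    print(make_xs_mask(xs).numpy())
    # [[[ True  True]
    #   [False False]]]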
@@ -31,6 +43,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         lengths (paddle.Tensor): Batch of lengths (B,).
     Returns:
         paddle.Tensor: Mask tensor containing indices of padded part.
+            (B, T)
     Examples:
         >>> lengths = [5, 3, 2]
         >>> make_pad_mask(lengths)
@@ -62,6 +75,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         lengths (paddle.Tensor): Batch of lengths (B,).
     Returns:
         paddle.Tensor: mask tensor containing indices of padded part.
+            (B, T)
     Examples:
         >>> lengths = [5, 3, 2]
         >>> make_non_pad_mask(lengths)

From b1f7eca4d97cc9af1202cf91fb37f5607fe42921 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 23 Oct 2021 04:13:04 +0000
Subject: [PATCH 13/17] do not change the ctc grad manually
---
 deepspeech/__init__.py                        | 34 ++-----------------
 deepspeech/modules/ctc.py                     |  5 +--
 deepspeech/modules/loss.py                    | 10 +++---
 examples/librispeech/s1/conf/transformer.yaml |  2 +-
 4 files changed, 11 insertions(+), 40 deletions(-)

diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index 5f9ba007e..6ed1177a5 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -355,37 +355,7 @@ if not hasattr(paddle.Tensor, 'tolist'):
     setattr(paddle.Tensor, 'tolist', tolist)

-
-########### hcak paddle.nn.functional #############
-# hack loss
-def ctc_loss(logits,
-             labels,
-             input_lengths,
-             label_lengths,
-             blank=0,
-             reduction='mean',
-             norm_by_times=True):
-    #logger.info("my ctc loss with norm by times")
-    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
-    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
-                                           input_lengths, label_lengths)
-
-    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
-    assert reduction in ['mean', 'sum', 'none']
-    if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / label_lengths)
-    elif reduction == 'sum':
-        loss_out = paddle.sum(loss_out)
-    return loss_out
-
-
-logger.debug(
-    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
-)
-F.ctc_loss = ctc_loss
-
-
-########### hcak paddle.nn #############
+########### hack paddle.nn #############
 from paddle.nn import Layer
 from typing import Optional
 from typing import Mapping
@@ -532,3 +502,5 @@ if not hasattr(paddle.nn, 'LayerDict'):
     logger.debug(
         "register user LayerDict to paddle.nn, remove this when fixed!")
     setattr(paddle.nn, 'LayerDict', LayerDict)
+
+
diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py
index 565a11e15..e0c8006d1 100644
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import paddle
 from paddle import nn
+from typing import Union
 from paddle.nn import functional as F
 from typeguard import check_argument_types

@@ -40,7 +41,7 @@ class CTCDecoderBase(nn.Layer):
                  dropout_rate: float=0.0,
                  reduction: bool=True,
                  batch_average: bool=True,
-                 grad_norm_type: str="instance"):
+                 grad_norm_type: Union[str, None]=None):
         """CTC decoder

         Args:
@@ -49,7 +50,7 @@ class CTCDecoderBase(nn.Layer):
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
             reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
             batch_average (bool): do batch dim wise average.
-            grad_norm_type (str): one of 'instance', 'batch', 'frame', None.
+            grad_norm_type (str): Default None. One of 'instance', 'batch', 'frame', None.
         """
         assert check_argument_types()
         super().__init__()
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index e06f26f81..e11388107 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -54,7 +54,7 @@ class CTCLoss(nn.Layer):
             self.norm_by_total_logits_len = True
         else:
             raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}")
-        self.kwargs = {
+        kwargs = {
             "norm_by_times": self.norm_by_times,
             "norm_by_batchsize": self.norm_by_batchsize,
             "norm_by_total_logits_len": self.norm_by_total_logits_len,
@@ -66,10 +66,9 @@
         except ValueError:
             # Some function, e.g.
built-in function, are failed
            param = {}
-        self._kwargs = {k: v for k, v in self.kwargs.items() if k in param}
-        _notin = {k: v for k, v in self.kwargs.items() if k not in param}
+        self._kwargs = {k: v for k, v in kwargs.items() if k in param}
+        _notin = {k: v for k, v in kwargs.items() if k not in param}
         logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}")
-        #self.loss_fn = partial(self.loss.forward, **_kwargs)

     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
@@ -89,8 +88,7 @@
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         ys_pad = ys_pad.astype(paddle.int32)
-        #loss = self.loss_fn(logits, ys_pad, hlens, ys_lens)
-        loss = self.loss(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(logits, ys_pad, hlens, ys_lens, **self._kwargs)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index c9dc1413b..3cc17004c 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -68,7 +68,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

From 9cc0f84ec99ccc4101900998748cc0ff164fd6ef Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 23 Oct 2021 04:13:37 +0000
Subject: [PATCH 14/17] ctc_grad_norm_type by null
---
 examples/aishell/s0/conf/deepspeech2.yaml              | 2 +-
 examples/aishell/s0/conf/deepspeech2_online.yaml       | 2 +-
 examples/aishell/s1/conf/chunk_conformer.yaml          | 2 +-
 examples/aishell/s1/conf/conformer.yaml                | 2 +-
 examples/librispeech/s0/conf/deepspeech2.yaml          | 2 +-
 examples/librispeech/s0/conf/deepspeech2_online.yaml   | 2 +-
 examples/librispeech/s1/conf/chunk_conformer.yaml      | 2 +-
 examples/librispeech/s1/conf/chunk_transformer.yaml    | 2 +-
 examples/librispeech/s1/conf/conformer.yaml            | 2 +-
 examples/librispeech/s2/conf/chunk_conformer.yaml      | 2 +-
 examples/librispeech/s2/conf/chunk_transformer.yaml    | 2 +-
 examples/librispeech/s2/conf/conformer.yaml            | 2 +-
 examples/ted_en_zh/t0/conf/transformer.yaml            | 2 +-
 examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml | 2 +-
 examples/timit/s1/conf/transformer.yaml                | 2 +-
 examples/tiny/s0/conf/deepspeech2.yaml                 | 2 +-
 examples/tiny/s0/conf/deepspeech2_online.yaml          | 2 +-
 examples/tiny/s1/conf/chunk_confermer.yaml             | 2 +-
 examples/tiny/s1/conf/chunk_transformer.yaml           | 2 +-
 examples/tiny/s1/conf/conformer.yaml                   | 2 +-
 examples/tiny/s1/conf/transformer.yaml                 | 2 +-
 21 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index ffefaeb31..7a198991b 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -41,7 +41,7 @@ model:
     use_gru: True
     share_rnn_weights: False
     blank_id: 0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null

 training:
   n_epoch: 80
diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml
index cac599dc7..c15e71a31 100644
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
     fc_layers_size_list: -1,
     use_gru: False
     blank_id: 0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null

 training:
   n_epoch: 50
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index 9b563da27..8682538b3 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -77,7 +77,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index dfa9a4b0b..71cd044ed 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -72,7 +72,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index 47ef94211..8afaabf42 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -41,7 +41,7 @@ model:
     use_gru: False
     share_rnn_weights: True
     blank_id: 0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null

 training:
   n_epoch: 50
diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml
index e2f910946..d6ab95237 100644
--- a/examples/librispeech/s0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
     fc_layers_size_list: 512, 256
     use_gru: False
     blank_id: 0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null

 training:
   n_epoch: 50
diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml
index 9936450ba..4d0e6ceb1 100644
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@@ -77,7 +77,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml
index 44f3e5a75..c7b53f95b 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -70,7 +70,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml
index a05e37dd1..3bc942dc0 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -73,7 +73,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml
index 872b560be..afd2b0514 100644
--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
@@ -77,7 +77,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml
index 132a4f9d2..721bb7d94 100644
--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
@@ -70,7 +70,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml
index bc87466eb..ef87753cb 100644
--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
@@ -73,7 +73,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml
index 8c03e328d..8a7e10f0a 100644
--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
@@ -69,7 +69,7 @@ model:
     asr_weight: 0.0
     ctc_weight: 0.0
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
index cbfae93e6..f8dc383f0 100644
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
@@ -69,7 +69,7 @@ model:
     asr_weight: 0.5
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml
index e138fbbe4..d3ced898e 100644
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
@@ -67,7 +67,7 @@ model:
   model_conf:
     ctc_weight: 0.5
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index a7940cb2f..621b372cb 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -42,7 +42,7 @@ model:
     use_gru: False
     share_rnn_weights: True
     blank_id: 0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null

 training:
   n_epoch: 10
diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml
index 7e30409fb..5a8294adb 100644
--- a/examples/tiny/s0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/s0/conf/deepspeech2_online.yaml
@@ -44,7 +44,7 @@ model:
     fc_layers_size_list: 512, 256
     use_gru: True
     blank_id: 0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null

 training:
   n_epoch: 10
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
index f3c7e1dd8..b14b4b212 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -77,7 +77,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
index 830057548..38edbf358 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -70,7 +70,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
index 628e3b77e..0b06b2b72 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -73,7 +73,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index 27ffcae4b..1c6f9e022 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -67,7 +67,7 @@ model:
   model_conf:
     ctc_weight: 0.3
     ctc_dropoutrate: 0.0
-    ctc_grad_norm_type: instance
+    ctc_grad_norm_type: null
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false

From 9c62ad69d71a6d4c23155879b85a3492d40c5a1f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 22 Oct 2021 10:18:46 +0000
Subject: [PATCH 15/17] decoder with ctc prefix score
---
 deepspeech/__init__.py                          |  20 +-
 deepspeech/decoders/beam_search.py              |  22 +-
 deepspeech/decoders/recog.py                    | 111 +++--
 deepspeech/decoders/scorers/ctc.py              |   4 +-
 deepspeech/decoders/scorers/ctc_prefix_score.py |   4 +-
 ...score_interface.py => scorer_interface.py}  |   0
 deepspeech/decoders/utils.py                    |  10 +-
 deepspeech/exps/u2_kaldi/bin/recog.py           | 379 ++++++++++++++++++
 deepspeech/exps/u2_kaldi/model.py               |   2 -
 .../frontend/featurizer/text_featurizer.py     |   2 +-
 deepspeech/models/u2/u2.py                      |   2 +-
 deepspeech/modules/decoder.py                   |  11 +-
 deepspeech/modules/mask.py                      |   7 +-
 deepspeech/training/cli.py                      |   6 +-
 deepspeech/training/trainer.py                  |  20 +-
 examples/librispeech/s2/README.md               |   1 +
 .../librispeech/s2/conf/decode/decode.yaml     |   7 +
 .../s2/conf/decode/decode_all.yaml             |   7 +
 .../s2/conf/decode/decode_wo_lm.yaml           |   7 +
 examples/librispeech/s2/local/recog.sh         | 103 +++++
 examples/librispeech/s2/local/test.sh          |   2 +-
 requirements.txt                                |   1 +
 22 files changed, 647 insertions(+), 81 deletions(-)
 rename deepspeech/decoders/scorers/{score_interface.py => scorer_interface.py} (100%)
 create mode 100644 deepspeech/exps/u2_kaldi/bin/recog.py
 create mode 100644 examples/librispeech/s2/conf/decode/decode.yaml
 create mode 100644 examples/librispeech/s2/conf/decode/decode_all.yaml
 create mode 100644 examples/librispeech/s2/conf/decode/decode_wo_lm.yaml
 create mode 100755 examples/librispeech/s2/local/recog.sh

diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index 5f9ba007e..ca5b4ba5c 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -233,7 +233,7 @@ def is_broadcastable(shp1, shp2):

 def masked_fill(xs: paddle.Tensor,
                 mask: paddle.Tensor,
                 value: Union[float, int]):
-    assert is_broadcastable(xs.shape, mask.shape) is True
+    assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape, mask.shape)
     bshape = paddle.broadcast_shape(xs.shape, mask.shape)
     mask = mask.broadcast_to(bshape)
     trues = paddle.ones_like(xs) * value
@@ -312,18 +312,18 @@ if not hasattr(paddle.Tensor, 'type_as'):


 def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
-        assert len(args) == 1
-        if isinstance(args[0], str):  # dtype
-            return x.astype(args[0])
-        elif isinstance(args[0], paddle.Tensor):  #Tensor
-            return x.astype(args[0].dtype)
-        else:  # Device
-            return x
+    assert len(args) == 1
+    if isinstance(args[0], str):  # dtype
+        return x.astype(args[0])
+    elif isinstance(args[0], paddle.Tensor):  # Tensor
+        return x.astype(args[0].dtype)
+    else:  # Device
+        return x


 if not hasattr(paddle.Tensor, 'to'):
-        logger.debug("register user to to paddle.Tensor, remove this when fixed!")
-        setattr(paddle.Tensor, 'to', to)
+    logger.debug("register user to to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'to', to)


 def func_float(x: paddle.Tensor) -> paddle.Tensor:
diff --git a/deepspeech/decoders/beam_search.py b/deepspeech/decoders/beam_search.py
index 5bf0d3e2f..afb8aefa5 100644
--- a/deepspeech/decoders/beam_search.py
+++ b/deepspeech/decoders/beam_search.py
@@ -1,7 +1,6 @@
 """Beam search module."""
 from itertools import chain
-import logger
 from typing import Any
 from typing import Dict
 from typing import List
@@ -141,7 +140,7 @@ class BeamSearch(paddle.nn.Layer):
         ]

     @staticmethod
-    def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor:
+    def append_token(xs: paddle.Tensor, x: Union[int, paddle.Tensor]) -> paddle.Tensor:
         """Append new token to prefix tokens.

         Args:
@@ -152,8 +151,8 @@ class BeamSearch(paddle.nn.Layer):
             paddle.Tensor: (T+1,), New tensor contains: xs + [x] with xs.dtype and xs.device
         """
-        x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place)
-        return paddle.cat((xs, x))
+        x = paddle.to_tensor([x], dtype=xs.dtype) if isinstance(x, int) else x
+        return paddle.concat((xs, x))

     def score_full(
             self, hyp: Hypothesis, x: paddle.Tensor
@@ -306,7 +305,7 @@ class BeamSearch(paddle.nn.Layer):
             part_ids = paddle.arange(self.n_vocab)  # no pre-beam
         for hyp in running_hyps:
             # scoring
-            weighted_scores = paddle.zeros(self.n_vocab, dtype=x.dtype)
+            weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype)
             scores, states = self.score_full(hyp, x)
             for k in self.full_scorers:
                 weighted_scores += self.weights[k] * scores[k]
@@ -410,15 +409,20 @@ class BeamSearch(paddle.nn.Layer):
         best = nbest_hyps[0]
         for k, v in best.scores.items():
             logger.info(
-                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
+                f"{float(v):6.2f} * {self.weights[k]:3} = {float(v) * self.weights[k]:6.2f} for {k}"
             )
-        logger.info(f"total log probability: {best.score:.2f}")
-        logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
+        logger.info(f"total log probability: {float(best.score):.2f}")
+        logger.info(f"normalized log probability: {float(best.score) / len(best.yseq):.2f}")
         logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
         if self.token_list is not None:
+            # logger.info(
+            #     "best hypo: "
+            #     + "".join([self.token_list[x] for x in best.yseq[1:-1]])
+            #     + "\n"
+            # )
             logger.info(
                 "best hypo: "
-                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
+                + "".join([self.token_list[x] for x in best.yseq[1:]])
                 + "\n"
             )
         return nbest_hyps
diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py
index 399c5c54f..867569aa0 100644
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@@ -2,18 +2,22 @@

 import json
 import paddle
+import yaml
+from yacs.config import CfgNode
+from pathlib import Path
+import jsonlines

 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.asr.pytorch_backend.asr import load_trained_model
 # from espnet.nets.lm_interface import dynamic_import_lm

-# from espnet.nets.asr_interface import ASRInterface
+from deepspeech.models.asr_interface import ASRInterface

 from .utils import add_results_to_json
 # from .batch_beam_search import BatchBeamSearch
 from .beam_search import BeamSearch
-from .scorer_interface import BatchScorerInterface
+from .scorers.scorer_interface import BatchScorerInterface
 from .scorers.length_bonus import LengthBonus

 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.utility import print_arguments
+
+model_test_alias = {
+    "u2": "deepspeech.exps.u2.model:U2Tester",
+    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
+}
+

 def recog_v2(args):
     """Decode with custom models that implement ScorerInterface.

     Args:
         args (namespace): The program arguments.
         See py:func:`bin.asr_recog.get_parser` for details

     """
     logger.warning("experimental API for custom LMs is selected by --api v2")
     if args.batchsize > 1:
         raise NotImplementedError("multi-utt batch decoding is not implemented")
     if args.streaming_mode is not None:
         raise NotImplementedError("streaming mode is not implemented")
     if args.word_rnnlm:
         raise NotImplementedError("word LM is not implemented")
-
+    args.nprocs = args.ngpu
     # set_deterministic(args)
+
+    #model, train_args = load_trained_model(args.model)
+    model_path = Path(args.model)
+    ckpt_dir = model_path.parent.parent
+
+    confs = CfgNode()
+    confs.set_new_allowed(True)
+    confs.merge_from_file(args.model_conf)
+
+    class_obj = dynamic_import(args.model_name, model_test_alias)
+    exp = class_obj(confs, args)
+    with exp.eval():
+        exp.setup()
+        exp.restore()
+    char_list = exp.args.char_list
+
+    model = exp.model
     assert isinstance(model, ASRInterface)
     load_inputs_and_targets = LoadInputsAndTargets(
         mode="asr",
         load_output=False,
         sort_in_input_length=False,
-        preprocess_conf=train_args.preprocess_conf
+        preprocess_conf=confs.collator.augmentation_config
         if args.preprocess_conf is None
         else args.preprocess_conf,
         preprocess_args={"train": False},
     )
@@ -56,7 +83,7 @@ def recog_v2(args):
         # NOTE: for compatibility with models older than version 0.5.0
         lm_model_module = getattr(lm_args, "model_module", "default")
         lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
-        lm = lm_class(len(train_args.char_list), lm_args)
+        lm = lm_class(len(char_list), lm_args)
         torch_load(args.rnnlm, lm)
         lm.eval()
     else:
         lm = None
@@ -67,16 +94,16 @@ def recog_v2(args):
         from .scorers.ngram import NgramPartScorer

         if args.ngram_scorer == "full":
-            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
+            ngram = NgramFullScorer(args.ngram_model, char_list)
         else:
-            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
+            ngram = NgramPartScorer(args.ngram_model, char_list)
     else:
         ngram = None

     scorers = model.scorers()
     scorers["lm"] = lm
     scorers["ngram"] = ngram
-    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
+    scorers["length_bonus"] = LengthBonus(len(char_list))
     weights = dict(
         decoder=1.0 - args.ctc_weight,
         ctc=args.ctc_weight,
@@ -86,14 +113,15 @@ def recog_v2(args):
     )
     beam_search = BeamSearch(
         beam_size=args.beam_size,
-        vocab_size=len(train_args.char_list),
+        vocab_size=len(char_list),
         weights=weights,
         scorers=scorers,
         sos=model.sos,
         eos=model.eos,
-        token_list=train_args.char_list,
+        token_list=char_list,
         pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
     )
+
     # TODO(karita): make all scorers batchfied
     if args.batchsize == 1:
         non_batch = [
@@ -116,6 +144,7 @@ def recog_v2(args):
         device = "gpu:0"
     else:
         device = "cpu"
+    paddle.set_device(device)
     dtype = getattr(paddle, args.dtype)
     logger.info(f"Decoding device={device}, dtype={dtype}")
     model.to(device=device, dtype=dtype)
     model.eval()
     beam_search.to(device=device, dtype=dtype)
     beam_search.eval()

     # read json data
-    with open(args.recog_json, "rb") as f:
-        js = json.load(f)
+    js = []
+    with jsonlines.open(args.recog_json, "r") as reader:
+        for item in reader:
+            js.append(item)
     # jsonlines to dict, keyed by 'utt'
     js = {item['utt']: item for item in js}

     new_js = {}
     with paddle.no_grad():
-        for idx, name in enumerate(js.keys(), 1):
-            logger.info("(%d/%d) decoding " + name, idx, len(js.keys()))
-            batch = [(name, js[name])]
-            feat = load_inputs_and_targets(batch)[0][0]
-            enc = model.encode(paddle.to_tensor(feat).to(device=device, dtype=dtype))
-            nbest_hyps = beam_search(
-                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
-            )
-            nbest_hyps = [
-                h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
-            ]
-            new_js[name] = add_results_to_json(
-                js[name], nbest_hyps, train_args.char_list
-            )
-
-    with open(args.result_label, "wb") as f:
-        f.write(
-            json.dumps(
-                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
-            ).encode("utf_8")
-        )
+        with jsonlines.open(args.result_label, "w") as f:
+            for idx, name in enumerate(js.keys(), 1):
+                logger.info(f"({idx}/{len(js.keys())}) decoding " + name)
+                batch = [(name, js[name])]
+                feat = load_inputs_and_targets(batch)[0][0]
+                logger.info(f'feat: {feat.shape}')
+                enc = model.encode(paddle.to_tensor(feat).to(dtype))
+                logger.info(f'eouts: {enc.shape}')
+                nbest_hyps = beam_search(
+                    x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
+                )
+                nbest_hyps = [
+                    h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
+                ]
+                new_js[name] = add_results_to_json(
+                    js[name], nbest_hyps, char_list
+                )
+
+                item = new_js[name]['output'][0]  # 1-best
+                utt = name
+                ref = item['text']
+                rec_text = item['rec_text'].replace('▁', ' ').replace('<eos>', '').strip()
+                rec_tokenid = list(map(int, item['rec_tokenid'].split()))
+                f.write({
+                    "utt": utt,
+                    "refs": [ref],
+                    "hyps": [rec_text],
+                    "hyps_tokenid": [rec_tokenid],
+                })
\ No newline at end of file
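Each line written to --result-label is one JSON object per utterance; roughly this shape, with illustrative values (note rec_tokenid is materialized as a list above so it serializes cleanly):

    # Sketch of one jsonlines record emitted by the loop above.
    record = {
        "utt": "utt_0001",
        "refs": ["the reference transcript"],
        "hyps": ["the one-best hypothesis"],
        "hyps_tokenid": [[13, 7, 42]],
    }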
diff --git a/deepspeech/decoders/scorers/ctc.py b/deepspeech/decoders/scorers/ctc.py
index 36b4bfd36..4871d6e12 100644
--- a/deepspeech/decoders/scorers/ctc.py
+++ b/deepspeech/decoders/scorers/ctc.py
@@ -15,8 +15,8 @@
 import numpy as np
 import paddle

-from .ctc_prefix_score import CTCPrefixScorer
-from .ctc_prefix_score import CTCPrefixScorerPD
+from .ctc_prefix_score import CTCPrefixScore
+from .ctc_prefix_score import CTCPrefixScorePD
 from .scorer_interface import BatchPartialScorerInterface

diff --git a/deepspeech/decoders/scorers/ctc_prefix_score.py b/deepspeech/decoders/scorers/ctc_prefix_score.py
index 5f568c811..c85d546d3 100644
--- a/deepspeech/decoders/scorers/ctc_prefix_score.py
+++ b/deepspeech/decoders/scorers/ctc_prefix_score.py
@@ -6,7 +6,7 @@
 import paddle
 import six

-class CTCPrefixScorerPD():
+class CTCPrefixScorePD():
     """Batch processing of CTCPrefixScore
     which is based on Algorithm 2 in WATANABE et al.
@@ -267,7 +267,7 @@
-class CTCPrefixScorer():
+class CTCPrefixScore():
     """Compute CTC label sequence scores
     which is based on Algorithm 2 in WATANABE et al.
diff --git a/deepspeech/decoders/scorers/score_interface.py b/deepspeech/decoders/scorers/scorer_interface.py
similarity index 100%
rename from deepspeech/decoders/scorers/score_interface.py
rename to deepspeech/decoders/scorers/scorer_interface.py
diff --git a/deepspeech/decoders/utils.py b/deepspeech/decoders/utils.py
index 0281a78bb..f59b55d9b 100644
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__all__ = ["end_detect"]
+import numpy as np
+from deepspeech.utils.log import Log
+logger = Log(__name__).getlog()
+
+__all__ = ["end_detect", "parse_hypothesis", "add_results_to_json"]

 def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
@@ -118,7 +122,7 @@ def add_results_to_json(js, nbest_hyps, char_list):
         # show 1-best result
         if n == 1:
             if "text" in out_dic.keys():
-                logging.info("groundtruth: %s" % out_dic["text"])
-                logging.info("prediction : %s" % out_dic["rec_text"])
+                logger.info("groundtruth: %s" % out_dic["text"])
+                logger.info("prediction : %s" % out_dic["rec_text"])

     return new_js
\ No newline at end of file
diff --git a/deepspeech/exps/u2_kaldi/bin/recog.py b/deepspeech/exps/u2_kaldi/bin/recog.py
new file mode 100644
index 000000000..60f4e58ac
--- /dev/null
+++ b/deepspeech/exps/u2_kaldi/bin/recog.py
@@ -0,0 +1,379 @@
+
+"""End-to-end speech recognition model decoding script."""
+
+import configargparse
+import logging
+import os
+import random
+import sys
+
+import numpy as np
+
+from distutils.util import strtobool
+from deepspeech.training.cli import default_argument_parser
+
+# NOTE: you need this func to generate our sphinx doc
+
+def get_parser():
+    """Get default arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Transcribe text from speech using "
+        "a speech recognition model on one CPU or GPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add(
+        '--model-name',
+        type=str,
+        default='u2_kaldi',
+        help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st')
+    # general configuration
+    parser.add("--config", is_config_file=True, help="Config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="Second config file path that overwrites the settings in `--config`",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="Third config file path that overwrites the settings "
+        "in `--config` and `--config2`",
+    )
+
+    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
+    parser.add_argument(
+        "--dtype",
+        choices=("float16", "float32", "float64"),
+        default="float32",
+        help="Float precision (only available in --api v2)",
+    )
+    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
+    parser.add_argument("--seed", type=int, default=1, help="Random seed")
+    parser.add_argument("--verbose", "-V", type=int, default=2, help="Verbose option")
+    parser.add_argument(
+        "--batchsize",
+        type=int,
+        default=1,
+        help="Batch size for beam search (0: means no batch processing)",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    parser.add_argument(
+        "--api",
+        default="v2",
+        choices=["v2"],
+        help="Beam search APIs "
+        "v2: Experimental API. It supports any model that implements ScorerInterface.",
+    )
+    # task related
+    parser.add_argument(
+        "--recog-json", type=str, help="Filename of recognition data (json)"
+    )
+    parser.add_argument(
+        "--result-label",
+        type=str,
+        required=True,
+        help="Filename of result label data (json)",
+    )
+    # model (parameter) related
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+    parser.add_argument(
+        "--num-spkrs",
+        type=int,
+        default=1,
+        choices=[1, 2],
+        help="Number of speakers in the speech",
+    )
+    parser.add_argument(
+        "--num-encs", default=1, type=int, help="Number of encoders in the model."
+    )
+    # search related
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
+    parser.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="""Input length ratio to obtain max output length.
+        If maxlenratio=0.0 (default), it uses an end-detect function
+        to automatically find maximum hypothesis lengths.
+        If maxlenratio<0.0, its absolute value is interpreted
+        as a constant max output length""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    parser.add_argument(
+        "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
+    )
+    parser.add_argument(
+        "--weights-ctc-dec",
+        type=float,
+        action="append",
+        help="ctc weight assigned to each encoder during decoding."
+        "[in multi-encoder mode only]",
+    )
+    parser.add_argument(
+        "--ctc-window-margin",
+        type=int,
+        default=0,
+        help="""Use CTC window with margin parameter to accelerate
+        CTC/attention decoding especially on GPU. Smaller margin
+        makes decoding faster, but may increase search errors.
+        If margin=0 (default), this function is disabled""",
+    )
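For concreteness, this is roughly how a maxlenratio-style setting is turned into a per-utterance length budget, as described in the help text above (a sketch, not the exact library code):

    def max_output_length(maxlenratio: float, n_frames: int) -> int:
        # Mirror the --maxlenratio semantics documented above.
        if maxlenratio == 0.0:
            return n_frames                 # rely on the end-detect heuristic
        if maxlenratio < 0.0:
            return int(-maxlenratio)        # absolute value as a constant budget
        return max(1, int(maxlenratio * n_frames))

    print(max_output_length(0.0, 200))    # 200
    print(max_output_length(-30.0, 200))  # 30
    print(max_output_length(0.5, 200))    # 100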
+    # transducer related
+    parser.add_argument(
+        "--search-type",
+        type=str,
+        default="default",
+        choices=["default", "nsc", "tsd", "alsd", "maes"],
+        help="""Type of beam search implementation to use during inference.
+        Can be either: default beam search ("default"),
+        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
+        Alignment-Length Synchronous Decoding ("alsd") or
+        modified Adaptive Expansion Search ("maes").""",
+    )
+    parser.add_argument(
+        "--nstep",
+        type=int,
+        default=1,
+        help="""Number of expansion steps allowed in NSC beam search or mAES
+        (nstep > 0 for NSC and nstep > 1 for mAES).""",
+    )
+    parser.add_argument(
+        "--prefix-alpha",
+        type=int,
+        default=2,
+        help="Length prefix difference allowed in NSC beam search or mAES.",
+    )
+    parser.add_argument(
+        "--max-sym-exp",
+        type=int,
+        default=2,
+        help="Number of symbol expansions allowed in TSD.",
+    )
+    parser.add_argument(
+        "--u-max",
+        type=int,
+        default=400,
+        help="Length prefix difference allowed in ALSD.",
+    )
+    parser.add_argument(
+        "--expansion-gamma",
+        type=float,
+        default=2.3,
+        help="Allowed logp difference for prune-by-value method in mAES.",
+    )
+    parser.add_argument(
+        "--expansion-beta",
+        type=int,
+        default=2,
+        help="""Number of additional candidates for expanded hypotheses
+        selection in mAES.""",
+    )
+    parser.add_argument(
+        "--score-norm",
+        type=strtobool,
+        nargs="?",
+        default=True,
+        help="Normalize final hypotheses' score by length",
+    )
+    parser.add_argument(
+        "--softmax-temperature",
+        type=float,
+        default=1.0,
+        help="Penalization term for softmax function.",
+    )
+    # rnnlm related
+    parser.add_argument(
+        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
+    )
+    parser.add_argument(
+        "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--word-rnnlm-conf",
+        type=str,
+        default=None,
+        help="Word RNNLM model config file to read",
+    )
+    parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
+    parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
+    # ngram related
+    parser.add_argument(
+        "--ngram-model", type=str, default=None, help="ngram model file to read"
+    )
+    parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
+    parser.add_argument(
+        "--ngram-scorer",
+        type=str,
+        default="part",
+        choices=("full", "part"),
+        help="""if the ngram is set as a part scorer, similar to the CTC scorer,
+        the ngram scorer only scores the top-K hypotheses.
+        if the ngram is set as a full scorer, it scores all hypotheses.
+        the decoding speed of the part scorer is much faster than the full one""",
+    )
+    # streaming related
+    parser.add_argument(
+        "--streaming-mode",
+        type=str,
+        default=None,
+        choices=["window", "segment"],
+        help="""Use streaming recognizer for inference.
+        `--batchsize` must be set to 0 to enable this mode""",
+    )
+    parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
+    parser.add_argument(
+        "--streaming-min-blank-dur",
+        type=int,
+        default=10,
+        help="Minimum blank duration threshold",
+    )
+    parser.add_argument(
+        "--streaming-onset-margin", type=int, default=1, help="Onset margin"
+    )
+    parser.add_argument(
+        "--streaming-offset-margin", type=int, default=1, help="Offset margin"
+    )
+    # non-autoregressive related
+    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
+    parser.add_argument(
+        "--maskctc-n-iterations",
+        type=int,
+        default=10,
+        help="Number of decoding iterations."
+        "For Mask CTC, set 0 to predict 1 mask/iter.",
+    )
+    parser.add_argument(
+        "--maskctc-probability-threshold",
+        type=float,
+        default=0.999,
+        help="Threshold probability for CTC output",
+    )
+    # quantize model related
+    parser.add_argument(
+        "--quantize-config",
+        nargs="*",
+        help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]",
+    )
+    parser.add_argument(
+        "--quantize-dtype", type=str, default="qint8", help="Dtype for dynamic quantization"
+    )
+    parser.add_argument(
+        "--quantize-asr-model",
+        type=bool,
+        default=False,
+        help="Quantize ASR model",
+    )
+    parser.add_argument(
+        "--quantize-lm-model",
+        type=bool,
+        default=False,
+        help="Quantize LM model",
+    )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function."""
+    parser = get_parser()
+    parser.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    parser.add_argument(
+        "--dict-path", type=str, help="path to load dict")
+    # parser = default_argument_parser(parser)
+    args = parser.parse_args(args)
+
+    if args.ngpu == 0 and args.dtype == "float16":
+        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")
+
+    # logging info
+    if args.verbose == 1:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose == 2:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+    logging.info(args)
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+    # TODO(mn5k): support of multiple GPUs
+    if args.ngpu > 1:
+        logging.error("The program only supports ngpu=1.")
+        sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # validate rnn options
+    if args.rnnlm is not None and args.word_rnnlm is not None:
+        logging.error(
+            "It seems that both --rnnlm and --word-rnnlm are specified. "
+            "Please use either option."
+ ) + sys.exit(1) + + # recog + if args.num_spkrs == 1: + if args.num_encs == 1: + # Experimental API that supports custom LMs + if args.api == "v2": + from deepspeech.decoders.recog import recog_v2 + recog_v2(args) + else: + raise ValueError("Only support --api v2") + else: + if args.api == "v2": + raise NotImplementedError( + f"--num-encs {args.num_encs} > 1 is not supported in --api v2" + ) + elif args.num_spkrs == 2: + raise ValueError("asr_mix not supported.") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 18e29b28f..3aadca856 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -317,11 +317,9 @@ class U2Trainer(Trainer): with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size - model = U2Model.from_config(model_conf) if self.parallel: model = paddle.DataParallel(model) - logger.info(f"{model}") layer_tools.print_params(model, logger.info) # lr diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index 34220432b..18ff411b0 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -207,7 +207,7 @@ class TextFeaturizer(): """Load vocabulary from file.""" vocab_list = load_dict(vocab_filepath, maskctc) assert vocab_list is not None - logger.info(f"Vocab: {pformat(vocab_list)}") + logger.debug(f"Vocab: {pformat(vocab_list)}") id2token = dict( [(idx, token) for (idx, token) in enumerate(vocab_list)]) diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py index 8915cbd7d..fa517906f 100644 --- a/deepspeech/models/u2/u2.py +++ b/deepspeech/models/u2/u2.py @@ -50,7 +50,7 @@ from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.utility import log_add from deepspeech.utils.utility import UpdateConfig from deepspeech.models.asr_interface import ASRInterface -from deepspeech.decoders.scorers.ctc_prefix_score import CTCPrefixScorer +from deepspeech.decoders.scorers.ctc import CTCPrefixScorer __all__ = ["U2Model", "U2InferModel"] diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py index 154b7390f..ee3572ea9 100644 --- a/deepspeech/modules/decoder.py +++ b/deepspeech/modules/decoder.py @@ -28,7 +28,7 @@ from deepspeech.modules.mask import make_non_pad_mask from deepspeech.modules.mask import subsequent_mask from deepspeech.modules.mask import make_xs_mask from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward -from deepspeech.decoders.scorers.score_interface import BatchScorerInterface +from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface from deepspeech.utils.log import Log logger = Log(__name__).getlog() @@ -191,8 +191,8 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ys: (ylen,) x: (xlen, n_feat) """ - ys_mask = subsequent_mask(len(ys)).unsqueeze(0) - x_mask = make_xs_mask(x.unsqueeze(0)) + ys_mask = subsequent_mask(len(ys)).unsqueeze(0) # (B,L,L) + x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1) # (B,1,T) if self.selfattention_layer_type != "selfattn": # TODO(karita): implement cache logging.warning( @@ -237,9 +237,8 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) - - xs_mask = make_xs_mask(xs) + ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L) + 
xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) logp, states = self.forward_one_step(xs, xs_mask, ys, ys_mask, cache=batch_state) # transpose state of [layer, batch] into [batch, layer] diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py index cffa10a7b..7ae418c03 100644 --- a/deepspeech/modules/mask.py +++ b/deepspeech/modules/mask.py @@ -24,15 +24,16 @@ __all__ = [ ] -def make_xs_mask(xs:paddle.Tensor) -> paddle.Tensor: +def make_xs_mask(xs:paddle.Tensor, pad_value=0.0) -> paddle.Tensor: """Maks mask tensor containing indices of non-padded part. Args: xs (paddle.Tensor): (B, T, D), zeros for pad. Returns: - paddle.Tensor: Mask Tensor indices of non-padded part. (B, T, D) + paddle.Tensor: Mask Tensor indices of non-padded part. (B, T) """ - pad_frame = paddle.zeros([1, 1, xs.shape[-1]], dtype=xs.dtype) + pad_frame = paddle.full([1, 1, xs.shape[-1]], pad_value, dtype=xs.dtype) mask = xs != pad_frame + mask = mask.all(axis=-1) return mask diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py index ca9652e3c..db7076d3d 100644 --- a/deepspeech/training/cli.py +++ b/deepspeech/training/cli.py @@ -35,7 +35,7 @@ class LoadFromFile(argparse.Action): parser.parse_args(f.read().split(), namespace) -def default_argument_parser(): +def default_argument_parser(parser=None): r"""A simple yet genral argument parser for experiments with parakeet. This is used in examples with parakeet. And it is intended to be used by @@ -62,7 +62,9 @@ def default_argument_parser(): argparse.ArgumentParser the parser """ - parser = argparse.ArgumentParser() + if parser is None: + parser = argparse.ArgumentParser() + parser.register('action', 'extend', ExtendAction) parser.add_argument( '--conf', type=open, action=LoadFromFile, help="config file.") diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 6c18fa369..b2ee7a1b8 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -126,7 +126,7 @@ class Trainer(): logger.info(f"Set seed {args.seed}") # profiler and benchmark options - if self.args.benchmark_batch_size: + if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size: with UpdateConfig(self.config): self.config.collator.batch_size = self.args.benchmark_batch_size self.config.training.log_interval = 1 @@ -326,12 +326,25 @@ class Trainer(): finally: self.destory() + def restore(self): + """Resume from latest checkpoint at checkpoints in the output + directory or load a specified checkpoint. + + If ``args.checkpoint_path`` is not None, load the checkpoint, else + resume training. 
+ """ + assert self.args.checkpoint_path + infos = self.checkpoint.load_latest_parameters( + self.model, + checkpoint_path=self.args.checkpoint_path) + return infos + def run_test(self): """Do Test/Decode""" try: with Timer("Test/Decode Done: {}"): with self.eval(): - self.resume_or_scratch() + self.restore() self.test() except KeyboardInterrupt: exit(-1) @@ -341,6 +354,7 @@ class Trainer(): try: with Timer("Export Done: {}"): with self.eval(): + self.restore() self.export() except KeyboardInterrupt: exit(-1) @@ -350,7 +364,7 @@ class Trainer(): try: with Timer("Align Done: {}"): with self.eval(): - self.resume_or_scratch() + self.restore() self.align() except KeyboardInterrupt: sys.exit(-1) diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md index 1f7c69194..6e41cc370 100644 --- a/examples/librispeech/s2/README.md +++ b/examples/librispeech/s2/README.md @@ -11,3 +11,4 @@ | test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 | | test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 | | test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 | +| test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 | diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/s2/conf/decode/decode.yaml new file mode 100644 index 000000000..867bf6118 --- /dev/null +++ b/examples/librispeech/s2/conf/decode/decode.yaml @@ -0,0 +1,7 @@ +batchsize: 0 +beam-size: 60 +ctc-weight: 0.4 +lm-weight: 0.0 +maxlenratio: 0.0 +minlenratio: 0.0 +penalty: 0.0 diff --git a/examples/librispeech/s2/conf/decode/decode_all.yaml b/examples/librispeech/s2/conf/decode/decode_all.yaml new file mode 100644 index 000000000..87d5f6d19 --- /dev/null +++ b/examples/librispeech/s2/conf/decode/decode_all.yaml @@ -0,0 +1,7 @@ +batchsize: 0 +beam-size: 60 +ctc-weight: 0.4 +lm-weight: 0.6 +maxlenratio: 0.0 +minlenratio: 0.0 +penalty: 0.0 \ No newline at end of file diff --git a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml new file mode 100644 index 000000000..a82dba23b --- /dev/null +++ b/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml @@ -0,0 +1,7 @@ +batchsize: 0 +beam-size: 60 +ctc-weight: 0.4 +lm-weight: 0.0 +maxlenratio: 0.0 +minlenratio: 0.0 +penalty: 0.0 \ No newline at end of file diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/s2/local/recog.sh new file mode 100755 index 000000000..0393535b4 --- /dev/null +++ b/examples/librispeech/s2/local/recog.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +set -e + +expdir=exp +datadir=data +nj=32 + +decode_config=conf/decode/decode.yaml +lang_model=rnnlm.model.best +lmexpdir=exp/train_rnnlm_pytorch_lm_transformer_cosine_batchsize32_lr1e-4_layer16_unigram5000_ngpu4/ + +lmtag='nolm' + +recog_set="test-clean test-other dev-clean dev-other" +recog_set="test-clean" + +# bpemode (unigram or bpe) +nbpe=5000 +bpemode=unigram +bpeprefix="data/bpe_${bpemode}_${nbpe}" +bpemodel=${bpeprefix}.model + +if [ $# != 3 ];then + echo "usage: ${0} config_path dict_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." 
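+# Example invocation (the paths are illustrative assumptions, not taken from
+# this patch); the three positional args are config_path, dict_path and
+# ckpt_path_prefix, matching the usage message above:
+#   ./local/recog.sh conf/transformer.yaml \
+#       data/bpe_unigram_5000_units.txt \
+#       exp/transformer/checkpoints/avg_10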
+ +config_path=$1 +dict=$2 +ckpt_prefix=$3 + +ckpt_dir=$(dirname `dirname ${ckpt_prefix}`) +echo "ckpt dir: ${ckpt_dir}" + +ckpt_tag=$(basename ${ckpt_prefix}) +echo "ckpt tag: ${ckpt_tag}" + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi +echo "chunk mode: ${chunk_mode}" +echo "decode conf: ${decode_config}" + +# download language model +#bash local/download_lm_en.sh +#if [ $? -ne 0 ]; then +# exit 1 +#fi + + +pids=() # initialize pids + +for dmethd in join_ctc; do +( + echo "${dmethd} decoding" + for rtask in ${recog_set}; do + ( + echo "${rtask} dataset" + decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag} + feat_recog_dir=${datadir} + mkdir -p ${decode_dir} + mkdir -p ${feat_recog_dir} + + # split data + split_json.sh manifest.${rtask} ${nj} + + #### use CPU for decoding + ngpu=0 + + # set batchsize 0 to disable batch decoding + ${decode_cmd} JOB=1:${nj} ${decode_dir}/log/decode.JOB.log \ + python3 -u ${BIN_DIR}/recog.py \ + --api v2 \ + --config ${decode_config} \ + --ngpu ${ngpu} \ + --batchsize 0 \ + --checkpoint_path ${ckpt_prefix} \ + --dict-path ${dict} \ + --recog-json ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} \ + --result-label ${decode_dir}/data.JOB.json \ + --model-conf ${config_path} \ + --model ${ckpt_prefix}.pdparams + + #--rnnlm ${lmexpdir}/${lang_model} \ + + score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} + + ) & + pids+=($!) # store background pids + i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done + [ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." || true + done +) +done + +echo "Finished" + +exit 0 diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh index a6b6cfa9d..006a13c5f 100755 --- a/examples/librispeech/s2/local/test.sh +++ b/examples/librispeech/s2/local/test.sh @@ -83,7 +83,7 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --opts decoding.batch_size ${batch_size} \ --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} - score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${expdir}/${decode_dir} ${dict} + score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} ) & pids+=($!) 
# store background pids diff --git a/requirements.txt b/requirements.txt index 2a3f06514..42fd645a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,3 +40,4 @@ pyworld jieba phkit yq +ConfigArgParse \ No newline at end of file From 1b75ca1eead918c9b6ab25426f2eea1a39a1c53f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 23 Oct 2021 15:31:43 +0000 Subject: [PATCH 16/17] avg model dump val loss mean --- utils/avg_model.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/utils/avg_model.py b/utils/avg_model.py index 1fc00cb65..7c05ec789 100755 --- a/utils/avg_model.py +++ b/utils/avg_model.py @@ -25,8 +25,8 @@ def main(args): paddle.set_device('cpu') val_scores = [] - beat_val_scores = [] - selected_epochs = [] + beat_val_scores = None + selected_epochs = None jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json') jsons = sorted(jsons, key=os.path.getmtime, reverse=True) @@ -80,9 +80,10 @@ def main(args): data = json.dumps({ "mode": 'val_best' if args.val_best else 'latest', "avg_ckpt": args.dst_model, - "ckpt": path_list, - "epoch": selected_epochs.tolist(), - "val_loss": beat_val_scores.tolist(), + "val_loss_mean": np.mean(beat_val_scores), + "ckpts": path_list, + "epochs": selected_epochs.tolist(), + "val_losses": beat_val_scores.tolist(), }) f.write(data + "\n") From 2101e20d9dc77dc5907b91a0eaa4cdc0198c599d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 23 Oct 2021 15:59:53 +0000 Subject: [PATCH 17/17] recog into decoders, format code --- deepspeech/__init__.py | 38 +- deepspeech/decoders/beam_search/__init__.py | 17 + .../decoders/beam_search/batch_beam_search.py | 17 + .../decoders/{ => beam_search}/beam_search.py | 236 ++++++----- deepspeech/decoders/recog.py | 134 +++--- deepspeech/decoders/recog_bin.py | 376 +++++++++++++++++ deepspeech/decoders/scorers/ngram.py | 5 +- deepspeech/decoders/utils.py | 7 +- deepspeech/exps/__init__.py | 49 +++ deepspeech/exps/u2_kaldi/bin/recog.py | 388 +----------------- deepspeech/exps/u2_kaldi/model.py | 5 +- .../frontend/featurizer/text_featurizer.py | 2 +- deepspeech/models/asr_interface.py | 39 +- deepspeech/models/u2/u2.py | 8 +- deepspeech/modules/decoder.py | 32 +- deepspeech/modules/mask.py | 2 +- deepspeech/training/cli.py | 2 +- deepspeech/training/trainer.py | 6 +- examples/librispeech/README.md | 6 +- examples/librispeech/s2/README.md | 14 +- .../librispeech/s2/conf/decode/decode.yaml | 2 +- examples/librispeech/s2/local/recog.sh | 24 +- examples/librispeech/s2/local/test.sh | 17 +- requirements.txt | 42 +- setup.py | 23 +- 25 files changed, 808 insertions(+), 683 deletions(-) create mode 100644 deepspeech/decoders/beam_search/__init__.py create mode 100644 deepspeech/decoders/beam_search/batch_beam_search.py rename deepspeech/decoders/{ => beam_search}/beam_search.py (74%) create mode 100644 deepspeech/decoders/recog_bin.py diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index ca5b4ba5c..da3b1acb1 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -233,7 +233,8 @@ def is_broadcastable(shp1, shp2): def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape, mask.shape) + assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape, + mask.shape) bshape = paddle.broadcast_shape(xs.shape, mask.shape) mask = mask.broadcast_to(bshape) trues = paddle.ones_like(xs) * value @@ -312,18 +313,18 @@ if not hasattr(paddle.Tensor, 'type_as'): def to(x: paddle.Tensor, *args, **kwargs) 
-> paddle.Tensor: - assert len(args) == 1 - if isinstance(args[0], str): # dtype - return x.astype(args[0]) - elif isinstance(args[0], paddle.Tensor): # Tensor - return x.astype(args[0].dtype) - else: # Device - return x + assert len(args) == 1 + if isinstance(args[0], str): # dtype + return x.astype(args[0]) + elif isinstance(args[0], paddle.Tensor): # Tensor + return x.astype(args[0].dtype) + else: # Device + return x if not hasattr(paddle.Tensor, 'to'): - logger.debug("register user to to paddle.Tensor, remove this when fixed!") - setattr(paddle.Tensor, 'to', to) + logger.debug("register user to to paddle.Tensor, remove this when fixed!") + setattr(paddle.Tensor, 'to', to) def func_float(x: paddle.Tensor) -> paddle.Tensor: @@ -355,7 +356,6 @@ if not hasattr(paddle.Tensor, 'tolist'): setattr(paddle.Tensor, 'tolist', tolist) - ########### hcak paddle.nn.functional ############# # hack loss def ctc_loss(logits, @@ -384,7 +384,6 @@ logger.debug( ) F.ctc_loss = ctc_loss - ########### hcak paddle.nn ############# from paddle.nn import Layer from typing import Optional @@ -394,6 +393,7 @@ from typing import Tuple from typing import Iterator from collections import OrderedDict, abc as container_abcs + class LayerDict(paddle.nn.Layer): r"""Holds submodules in a dictionary. @@ -438,7 +438,7 @@ class LayerDict(paddle.nn.Layer): return x """ - def __init__(self, modules: Optional[Mapping[str, Layer]] = None) -> None: + def __init__(self, modules: Optional[Mapping[str, Layer]]=None) -> None: super(LayerDict, self).__init__() if modules is not None: self.update(modules) @@ -505,10 +505,11 @@ class LayerDict(paddle.nn.Layer): """ if not isinstance(modules, container_abcs.Iterable): raise TypeError("LayerDict.update should be called with an " - "iterable of key/value pairs, but got " + - type(modules).__name__) + "iterable of key/value pairs, but got " + type( + modules).__name__) - if isinstance(modules, (OrderedDict, LayerDict, container_abcs.Mapping)): + if isinstance(modules, + (OrderedDict, LayerDict, container_abcs.Mapping)): for key, module in modules.items(): self[key] = module else: @@ -520,14 +521,15 @@ class LayerDict(paddle.nn.Layer): type(m).__name__) if not len(m) == 2: raise ValueError("LayerDict update sequence element " - "#" + str(j) + " has length " + str(len(m)) + - "; 2 is required") + "#" + str(j) + " has length " + str( + len(m)) + "; 2 is required") # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)] # that's too cumbersome to type correctly with overloads, so we add an ignore here self[m[0]] = m[1] # type: ignore[assignment] # remove forward alltogether to fallback on Module's _forward_unimplemented + if not hasattr(paddle.nn, 'LayerDict'): logger.debug( "register user LayerDict to paddle.nn, remove this when fixed!") diff --git a/deepspeech/decoders/beam_search/__init__.py b/deepspeech/decoders/beam_search/__init__.py new file mode 100644 index 000000000..79a1e9d30 --- /dev/null +++ b/deepspeech/decoders/beam_search/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .batch_beam_search import BatchBeamSearch +from .beam_search import beam_search +from .beam_search import BeamSearch +from .beam_search import Hypothesis diff --git a/deepspeech/decoders/beam_search/batch_beam_search.py b/deepspeech/decoders/beam_search/batch_beam_search.py new file mode 100644 index 000000000..3fc1c435f --- /dev/null +++ b/deepspeech/decoders/beam_search/batch_beam_search.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BatchBeamSearch(): + pass diff --git a/deepspeech/decoders/beam_search.py b/deepspeech/decoders/beam_search/beam_search.py similarity index 74% rename from deepspeech/decoders/beam_search.py rename to deepspeech/decoders/beam_search/beam_search.py index afb8aefa5..8fd8f9b8f 100644 --- a/deepspeech/decoders/beam_search.py +++ b/deepspeech/decoders/beam_search/beam_search.py @@ -1,5 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Beam search module.""" - from itertools import chain from typing import Any from typing import Dict @@ -10,18 +22,18 @@ from typing import Union import paddle -from .utils import end_detect -from .scorers.scorer_interface import PartialScorerInterface -from .scorers.scorer_interface import ScorerInterface - +from ..scorers.scorer_interface import PartialScorerInterface +from ..scorers.scorer_interface import ScorerInterface +from ..utils import end_detect from deepspeech.utils.log import Log logger = Log(__name__).getlog() + class Hypothesis(NamedTuple): """Hypothesis data type.""" - yseq: paddle.Tensor # (T,) + yseq: paddle.Tensor # (T,) score: Union[float, paddle.Tensor] = 0 scores: Dict[str, Union[float, paddle.Tensor]] = dict() states: Dict[str, Any] = dict() @@ -31,25 +43,24 @@ class Hypothesis(NamedTuple): return self._replace( yseq=self.yseq.tolist(), score=float(self.score), - scores={k: float(v) for k, v in self.scores.items()}, - )._asdict() + scores={k: float(v) + for k, v in self.scores.items()}, )._asdict() class BeamSearch(paddle.nn.Layer): """Beam search implementation.""" def __init__( - self, - scorers: Dict[str, ScorerInterface], - weights: Dict[str, float], - beam_size: int, - vocab_size: int, - sos: int, - eos: int, - token_list: List[str] = None, - pre_beam_ratio: float = 1.5, - pre_beam_score_key: str = None, - ): + self, + scorers: Dict[str, ScorerInterface], + weights: Dict[str, float], + beam_size: int, + vocab_size: int, + sos: int, + eos: int, + token_list: List[str]=None, + pre_beam_ratio: float=1.5, + pre_beam_score_key: str=None, ): """Initialize beam search. Args: @@ -71,12 +82,12 @@ class BeamSearch(paddle.nn.Layer): super().__init__() # set scorers self.weights = weights - self.scorers = dict() # all = full + partial - self.full_scorers = dict() # full tokens - self.part_scorers = dict() # partial tokens + self.scorers = dict() # all = full + partial + self.full_scorers = dict() # full tokens + self.part_scorers = dict() # partial tokens # this module dict is required for recursive cast # `self.to(device, dtype)` in `recog.py` - self.nn_dict = paddle.nn.LayerDict() # nn.Layer + self.nn_dict = paddle.nn.LayerDict() # nn.Layer for k, v in scorers.items(): w = weights.get(k, 0) if w == 0 or v is None: @@ -100,20 +111,16 @@ class BeamSearch(paddle.nn.Layer): self.pre_beam_size = int(pre_beam_ratio * beam_size) self.beam_size = beam_size self.n_vocab = vocab_size - if ( - pre_beam_score_key is not None - and pre_beam_score_key != "full" - and pre_beam_score_key not in self.full_scorers - ): - raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}") + if (pre_beam_score_key is not None and pre_beam_score_key != "full" and + pre_beam_score_key not in self.full_scorers): + raise KeyError( + f"{pre_beam_score_key} is not found in {self.full_scorers}") # selected `key` scorer to do pre beam search self.pre_beam_score_key = pre_beam_score_key # do_pre_beam when need, valid and has part_scorers - self.do_pre_beam = ( - self.pre_beam_score_key is not None - and self.pre_beam_size < self.n_vocab - and len(self.part_scorers) > 0 - ) + self.do_pre_beam = (self.pre_beam_score_key is not None and + self.pre_beam_size < self.n_vocab and + len(self.part_scorers) > 0) def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]: """Get an initial hypothesis data. 
@@ -135,12 +142,12 @@ class BeamSearch(paddle.nn.Layer): yseq=paddle.to_tensor([self.sos], place=x.place), score=0.0, scores=init_scores, - states=init_states, - ) + states=init_states, ) ] @staticmethod - def append_token(xs: paddle.Tensor, x: Union[int, paddle.Tensor]) -> paddle.Tensor: + def append_token(xs: paddle.Tensor, + x: Union[int, paddle.Tensor]) -> paddle.Tensor: """Append new token to prefix tokens. Args: @@ -154,9 +161,8 @@ class BeamSearch(paddle.nn.Layer): x = paddle.to_tensor([x], dtype=xs.dtype) if isinstance(x, int) else x return paddle.concat((xs, x)) - def score_full( - self, hyp: Hypothesis, x: paddle.Tensor - ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: + def score_full(self, hyp: Hypothesis, x: paddle.Tensor + ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: """Score new hypothesis by `self.full_scorers`. Args: @@ -178,9 +184,11 @@ class BeamSearch(paddle.nn.Layer): scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x) return scores, states - def score_partial( - self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor - ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: + def score_partial(self, + hyp: Hypothesis, + ids: paddle.Tensor, + x: paddle.Tensor + ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: """Score new hypothesis by `self.part_scorers`. Args: @@ -201,12 +209,12 @@ class BeamSearch(paddle.nn.Layer): states = dict() for k, d in self.part_scorers.items(): # scores[k] shape (len(ids),) - scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x) + scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], + x) return scores, states - def beam( - self, weighted_scores: paddle.Tensor, ids: paddle.Tensor - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def beam(self, weighted_scores: paddle.Tensor, + ids: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute topk full token ids and partial token ids. Args: @@ -223,7 +231,8 @@ class BeamSearch(paddle.nn.Layer): """ # no pre beam performed, `ids` equal to `weighted_scores` if weighted_scores.size(0) == ids.size(0): - top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab + top_ids = weighted_scores.topk( + self.beam_size)[1] # index in n_vocab return top_ids, top_ids # mask pruned in pre-beam not to select in topk @@ -231,18 +240,18 @@ class BeamSearch(paddle.nn.Layer): weighted_scores[:] = -float("inf") weighted_scores[ids] = tmp # top_ids no equal to local_ids, since ids shape not same - top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab - local_ids = weighted_scores[ids].topk(self.beam_size)[1] # index in len(ids) + top_ids = weighted_scores.topk(self.beam_size)[1] # index in n_vocab + local_ids = weighted_scores[ids].topk( + self.beam_size)[1] # index in len(ids) return top_ids, local_ids @staticmethod def merge_scores( - prev_scores: Dict[str, float], - next_full_scores: Dict[str, paddle.Tensor], - full_idx: int, - next_part_scores: Dict[str, paddle.Tensor], - part_idx: int, - ) -> Dict[str, paddle.Tensor]: + prev_scores: Dict[str, float], + next_full_scores: Dict[str, paddle.Tensor], + full_idx: int, + next_part_scores: Dict[str, paddle.Tensor], + part_idx: int, ) -> Dict[str, paddle.Tensor]: """Merge scores for new hypothesis. 
Args: @@ -288,9 +297,8 @@ class BeamSearch(paddle.nn.Layer): new_states[k] = d.select_state(part_states[k], part_idx) return new_states - def search( - self, running_hyps: List[Hypothesis], x: paddle.Tensor - ) -> List[Hypothesis]: + def search(self, running_hyps: List[Hypothesis], + x: paddle.Tensor) -> List[Hypothesis]: """Search new tokens for running hypotheses and encoded speech x. Args: @@ -311,11 +319,9 @@ class BeamSearch(paddle.nn.Layer): weighted_scores += self.weights[k] * scores[k] # partial scoring if self.do_pre_beam: - pre_beam_scores = ( - weighted_scores - if self.pre_beam_score_key == "full" - else scores[self.pre_beam_score_key] - ) + pre_beam_scores = (weighted_scores + if self.pre_beam_score_key == "full" else + scores[self.pre_beam_score_key]) part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1] part_scores, part_states = self.score_partial(hyp, part_ids, x) for k in self.part_scorers: @@ -331,22 +337,21 @@ class BeamSearch(paddle.nn.Layer): Hypothesis( score=weighted_scores[j], yseq=self.append_token(hyp.yseq, j), - scores=self.merge_scores( - hyp.scores, scores, j, part_scores, part_j - ), + scores=self.merge_scores(hyp.scores, scores, j, + part_scores, part_j), states=self.merge_states(states, part_states, part_j), - ) - ) + )) # sort and prune 2 x beam -> beam - best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[ - : min(len(best_hyps), self.beam_size) - ] + best_hyps = sorted( + best_hyps, key=lambda x: x.score, + reverse=True)[:min(len(best_hyps), self.beam_size)] return best_hyps - def forward( - self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0 - ) -> List[Hypothesis]: + def forward(self, + x: paddle.Tensor, + maxlenratio: float=0.0, + minlenratio: float=0.0) -> List[Hypothesis]: """Perform beam search. Args: @@ -381,9 +386,11 @@ class BeamSearch(paddle.nn.Layer): logger.debug("position " + str(i)) best = self.search(running_hyps, x) # post process of one iteration - running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps) + running_hyps = self.post_process(i, maxlen, maxlenratio, best, + ended_hyps) # end detection - if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i): + if maxlenratio == 0.0 and end_detect( + [h.asdict() for h in ended_hyps], i): logger.info(f"end detected at {i}") break if len(running_hyps) == 0: @@ -395,15 +402,10 @@ class BeamSearch(paddle.nn.Layer): nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True) # check the number of hypotheses reaching to eos if len(nbest_hyps) == 0: - logger.warning( - "there is no N-best results, perform recognition " - "again with smaller minlenratio." 
- ) - return ( - [] - if minlenratio < 0.1 - else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1)) - ) + logger.warning("there is no N-best results, perform recognition " + "again with smaller minlenratio.") + return ([] if minlenratio < 0.1 else + self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))) # report the best result best = nbest_hyps[0] @@ -412,7 +414,9 @@ class BeamSearch(paddle.nn.Layer): f"{float(v):6.2f} * {self.weights[k]:3} = {float(v) * self.weights[k]:6.2f} for {k}" ) logger.info(f"total log probability: {float(best.score):.2f}") - logger.info(f"normalized log probability: {float(best.score) / len(best.yseq):.2f}") + logger.info( + f"normalized log probability: {float(best.score) / len(best.yseq):.2f}" + ) logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}") if self.token_list is not None: # logger.info( @@ -420,21 +424,17 @@ class BeamSearch(paddle.nn.Layer): # + "".join([self.token_list[x] for x in best.yseq[1:-1]]) # + "\n" # ) - logger.info( - "best hypo: " - + "".join([self.token_list[x] for x in best.yseq[1:]]) - + "\n" - ) + logger.info("best hypo: " + "".join( + [self.token_list[x] for x in best.yseq[1:]]) + "\n") return nbest_hyps def post_process( - self, - i: int, - maxlen: int, - maxlenratio: float, - running_hyps: List[Hypothesis], - ended_hyps: List[Hypothesis], - ) -> List[Hypothesis]: + self, + i: int, + maxlen: int, + maxlenratio: float, + running_hyps: List[Hypothesis], + ended_hyps: List[Hypothesis], ) -> List[Hypothesis]: """Perform post-processing of beam search iterations. Args: @@ -450,10 +450,8 @@ class BeamSearch(paddle.nn.Layer): """ logger.debug(f"the number of running hypotheses: {len(running_hyps)}") if self.token_list is not None: - logger.debug( - "best hypo: " - + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]]) - ) + logger.debug("best hypo: " + "".join( + [self.token_list[x] for x in running_hyps[0].yseq[1:]])) # add eos in the final loop to avoid that there are no ended hyps if i == maxlen - 1: logger.info("adding in the last position in the loop") @@ -468,7 +466,8 @@ class BeamSearch(paddle.nn.Layer): for hyp in running_hyps: if hyp.yseq[-1] == self.eos: # e.g., Word LM needs to add final score - for k, d in chain(self.full_scorers.items(), self.part_scorers.items()): + for k, d in chain(self.full_scorers.items(), + self.part_scorers.items()): s = d.final_score(hyp.states[k]) hyp.scores[k] += s hyp = hyp._replace(score=hyp.score + self.weights[k] * s) @@ -479,19 +478,18 @@ class BeamSearch(paddle.nn.Layer): def beam_search( - x: paddle.Tensor, - sos: int, - eos: int, - beam_size: int, - vocab_size: int, - scorers: Dict[str, ScorerInterface], - weights: Dict[str, float], - token_list: List[str] = None, - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - pre_beam_ratio: float = 1.5, - pre_beam_score_key: str = "full", -) -> list: + x: paddle.Tensor, + sos: int, + eos: int, + beam_size: int, + vocab_size: int, + scorers: Dict[str, ScorerInterface], + weights: Dict[str, float], + token_list: List[str]=None, + maxlenratio: float=0.0, + minlenratio: float=0.0, + pre_beam_ratio: float=1.5, + pre_beam_score_key: str="full", ) -> list: """Perform beam search with scorers. 
Args: @@ -527,6 +525,6 @@ def beam_search( pre_beam_score_key=pre_beam_score_key, sos=sos, eos=eos, - token_list=token_list, - ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio) + token_list=token_list, ).forward( + x=x, maxlenratio=maxlenratio, minlenratio=minlenratio) return [h.asdict() for h in ret] diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py index 867569aa0..c8df65d68 100644 --- a/deepspeech/decoders/recog.py +++ b/deepspeech/decoders/recog.py @@ -1,37 +1,57 @@ -"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`.""" - +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`.""" import json +from pathlib import Path + +import jsonlines import paddle import yaml from yacs.config import CfgNode -from pathlib import Path -import jsonlines -# from espnet.asr.asr_utils import get_model_conf -# from espnet.asr.asr_utils import torch_load -# from espnet.asr.pytorch_backend.asr import load_trained_model -# from espnet.nets.lm_interface import dynamic_import_lm - -from deepspeech.models.asr_interface import ASRInterface - -from .utils import add_results_to_json -# from .batch_beam_search import BatchBeamSearch +from .beam_search import BatchBeamSearch from .beam_search import BeamSearch -from .scorers.scorer_interface import BatchScorerInterface from .scorers.length_bonus import LengthBonus - +from .scorers.scorer_interface import BatchScorerInterface +from .utils import add_results_to_json +from deepspeech.exps import dynamic_import_tester from deepspeech.io.reader import LoadInputsAndTargets +from deepspeech.models.asr_interface import ASRInterface from deepspeech.utils.log import Log +# from espnet.asr.asr_utils import get_model_conf +# from espnet.asr.asr_utils import torch_load +# from espnet.nets.lm_interface import dynamic_import_lm + logger = Log(__name__).getlog() +# NOTE: you need this func to generate our sphinx doc -from deepspeech.utils.dynamic_import import dynamic_import -from deepspeech.utils.utility import print_arguments -model_test_alias = { - "u2": "deepspeech.exps.u2.model:U2Tester", - "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester", -} +def load_trained_model(args): + args.nprocs = args.ngpu + confs = CfgNode() + confs.set_new_allowed(True) + confs.merge_from_file(args.model_conf) + class_obj = dynamic_import_tester(args.model_name) + exp = class_obj(confs, args) + with exp.eval(): + exp.setup() + exp.restore() + char_list = exp.args.char_list + model = exp.model + return model, char_list, exp, confs + def recog_v2(args): """Decode with custom models that implements ScorerInterface. 
@@ -48,33 +68,17 @@ def recog_v2(args): raise NotImplementedError("streaming mode is not implemented") if args.word_rnnlm: raise NotImplementedError("word LM is not implemented") - args.nprocs = args.ngpu - # set_deterministic(args) - - #model, train_args = load_trained_model(args.model) - model_path = Path(args.model) - ckpt_dir = model_path.parent.parent - - confs = CfgNode() - confs.set_new_allowed(True) - confs.merge_from_file(args.model_conf) - - class_obj = dynamic_import(args.model_name, model_test_alias) - exp = class_obj(confs, args) - with exp.eval(): - exp.setup() - exp.restore() - char_list = exp.args.char_list - model = exp.model + # set_deterministic(args) + model, char_list, exp, confs = load_trained_model(args) assert isinstance(model, ASRInterface) + load_inputs_and_targets = LoadInputsAndTargets( mode="asr", load_output=False, sort_in_input_length=False, preprocess_conf=confs.collator.augmentation_config - if args.preprocess_conf is None - else args.preprocess_conf, + if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) @@ -100,7 +104,7 @@ def recog_v2(args): else: ngram = None - scorers = model.scorers() + scorers = model.scorers() # decoder scorers["lm"] = lm scorers["ngram"] = ngram scorers["length_bonus"] = LengthBonus(len(char_list)) @@ -125,18 +129,15 @@ def recog_v2(args): # TODO(karita): make all scorers batchfied if args.batchsize == 1: non_batch = [ - k - for k, v in beam_search.full_scorers.items() + k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: beam_search.__class__ = BatchBeamSearch logger.info("BatchBeamSearch implementation is selected.") else: - logger.warning( - f"As non-batch scorers {non_batch} are found, " - f"fall back to non-batch implementation." 
- ) + logger.warning(f"As non-batch scorers {non_batch} are found, " + f"fall back to non-batch implementation.") if args.ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") @@ -157,7 +158,7 @@ def recog_v2(args): with jsonlines.open(args.recog_json, "r") as reader: for item in reader: js.append(item) - # josnlines to dict, key by 'utt' + # jsonlines to dict, key by 'utt', value by jsonline js = {item['utt']: item for item in js} new_js = {} @@ -169,25 +170,26 @@ def recog_v2(args): feat = load_inputs_and_targets(batch)[0][0] logger.info(f'feat: {feat.shape}') enc = model.encode(paddle.to_tensor(feat).to(dtype)) - logger.info(f'eouts: {enc.shape}') - nbest_hyps = beam_search( - x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio - ) + logger.info(f'eout: {enc.shape}') + nbest_hyps = beam_search(x=enc, + maxlenratio=args.maxlenratio, + minlenratio=args.minlenratio) nbest_hyps = [ - h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)] + h.asdict() + for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)] ] - new_js[name] = add_results_to_json( - js[name], nbest_hyps, char_list - ) + new_js[name] = add_results_to_json(js[name], nbest_hyps, + char_list) - item = new_js[name]['output'][0] # 1-best - utt = name + item = new_js[name]['output'][0] # 1-best ref = item['text'] - rec_text = item['rec_text'].replace('▁', ' ').replace('', '').strip() - rec_tokenid = map(int, item['rec_tokenid'].split()) + rec_text = item['rec_text'].replace('▁', + ' ').replace('', + '').strip() + rec_tokenid = list(map(int, item['rec_tokenid'].split())) f.write({ - "utt": utt, - "refs": [ref], - "hyps": [rec_text], - "hyps_tokenid": [rec_tokenid], - }) \ No newline at end of file + "utt": name, + "refs": [ref], + "hyps": [rec_text], + "hyps_tokenid": [rec_tokenid], + }) diff --git a/deepspeech/decoders/recog_bin.py b/deepspeech/decoders/recog_bin.py new file mode 100644 index 000000000..567dfecde --- /dev/null +++ b/deepspeech/decoders/recog_bin.py @@ -0,0 +1,376 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
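+# Example command line (the paths are illustrative assumptions; the flags
+# mirror examples/librispeech/s2/local/recog.sh earlier in this series):
+#   python3 -u -m deepspeech.decoders.recog_bin \
+#       --api v2 --ngpu 0 --batchsize 0 \
+#       --config conf/decode/decode.yaml \
+#       --model-conf conf/transformer.yaml \
+#       --model exp/transformer/checkpoints/avg_10.pdparams \
+#       --checkpoint_path exp/transformer/checkpoints/avg_10 \
+#       --dict-path data/bpe_unigram_5000_units.txt \
+#       --recog-json data/manifest.test-clean \
+#       --result-label exp/decode/data.json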
+"""End-to-end speech recognition model decoding script.""" +import logging +import os +import random +import sys +from distutils.util import strtobool + +import configargparse +import numpy as np + +from .recog import recog_v2 + + +def get_parser(): + """Get default arguments.""" + parser = configargparse.ArgumentParser( + description="Transcribe text from speech using " + "a speech recognition model on one CPU or GPU", + config_file_parser_class=configargparse.YAMLConfigFileParser, + formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) + parser.add( + '--model-name', + type=str, + default='u2_kaldi', + help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st') + # general configuration + parser.add("--config", is_config_file=True, help="Config file path") + parser.add( + "--config2", + is_config_file=True, + help="Second config file path that overwrites the settings in `--config`", + ) + parser.add( + "--config3", + is_config_file=True, + help="Third config file path that overwrites the settings " + "in `--config` and `--config2`", ) + + parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs") + parser.add_argument( + "--dtype", + choices=("float16", "float32", "float64"), + default="float32", + help="Float precision (only available in --api v2)", ) + parser.add_argument("--debugmode", type=int, default=1, help="Debugmode") + parser.add_argument("--seed", type=int, default=1, help="Random seed") + parser.add_argument( + "--verbose", "-V", type=int, default=2, help="Verbose option") + parser.add_argument( + "--batchsize", + type=int, + default=1, + help="Batch size for beam search (0: means no batch processing)", ) + parser.add_argument( + "--preprocess-conf", + type=str, + default=None, + help="The configuration file for the pre-processing", ) + parser.add_argument( + "--api", + default="v2", + choices=["v2"], + help="Beam search APIs " + "v2: Experimental API. It supports any models that implements ScorerInterface.", + ) + # task related + parser.add_argument( + "--recog-json", type=str, help="Filename of recognition data (json)") + parser.add_argument( + "--result-label", + type=str, + required=True, + help="Filename of result label data (json)", ) + # model (parameter) related + parser.add_argument( + "--model", + type=str, + required=True, + help="Model file parameters to read") + parser.add_argument( + "--model-conf", type=str, default=None, help="Model config file") + parser.add_argument( + "--num-spkrs", + type=int, + default=1, + choices=[1, 2], + help="Number of speakers in the speech", ) + parser.add_argument( + "--num-encs", + default=1, + type=int, + help="Number of encoders in the model.") + # search related + parser.add_argument( + "--nbest", type=int, default=1, help="Output N-best hypotheses") + parser.add_argument("--beam-size", type=int, default=1, help="Beam size") + parser.add_argument( + "--penalty", type=float, default=0.0, help="Incertion penalty") + parser.add_argument( + "--maxlenratio", + type=float, + default=0.0, + help="""Input length ratio to obtain max output length. + If maxlenratio=0.0 (default), it uses a end-detect function + to automatically find maximum hypothesis lengths. 
+ If maxlenratio<0.0, its absolute value is interpreted + as a constant max output length""", ) + parser.add_argument( + "--minlenratio", + type=float, + default=0.0, + help="Input length ratio to obtain min output length", ) + parser.add_argument( + "--ctc-weight", + type=float, + default=0.0, + help="CTC weight in joint decoding") + parser.add_argument( + "--weights-ctc-dec", + type=float, + action="append", + help="ctc weight assigned to each encoder during decoding." + "[in multi-encoder mode only]", ) + parser.add_argument( + "--ctc-window-margin", + type=int, + default=0, + help="""Use CTC window with margin parameter to accelerate + CTC/attention decoding especially on GPU. Smaller magin + makes decoding faster, but may increase search errors. + If margin=0 (default), this function is disabled""", ) + # transducer related + parser.add_argument( + "--search-type", + type=str, + default="default", + choices=["default", "nsc", "tsd", "alsd", "maes"], + help="""Type of beam search implementation to use during inference. + Can be either: default beam search ("default"), + N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"), + Alignment-Length Synchronous Decoding ("alsd") or + modified Adaptive Expansion Search ("maes").""", ) + parser.add_argument( + "--nstep", + type=int, + default=1, + help="""Number of expansion steps allowed in NSC beam search or mAES + (nstep > 0 for NSC and nstep > 1 for mAES).""", ) + parser.add_argument( + "--prefix-alpha", + type=int, + default=2, + help="Length prefix difference allowed in NSC beam search or mAES.", ) + parser.add_argument( + "--max-sym-exp", + type=int, + default=2, + help="Number of symbol expansions allowed in TSD.", ) + parser.add_argument( + "--u-max", + type=int, + default=400, + help="Length prefix difference allowed in ALSD.", ) + parser.add_argument( + "--expansion-gamma", + type=float, + default=2.3, + help="Allowed logp difference for prune-by-value method in mAES.", ) + parser.add_argument( + "--expansion-beta", + type=int, + default=2, + help="""Number of additional candidates for expanded hypotheses + selection in mAES.""", ) + parser.add_argument( + "--score-norm", + type=strtobool, + nargs="?", + default=True, + help="Normalize final hypotheses' score by length", ) + parser.add_argument( + "--softmax-temperature", + type=float, + default=1.0, + help="Penalization term for softmax function.", ) + # rnnlm related + parser.add_argument( + "--rnnlm", type=str, default=None, help="RNNLM model file to read") + parser.add_argument( + "--rnnlm-conf", + type=str, + default=None, + help="RNNLM model config file to read") + parser.add_argument( + "--word-rnnlm", + type=str, + default=None, + help="Word RNNLM model file to read") + parser.add_argument( + "--word-rnnlm-conf", + type=str, + default=None, + help="Word RNNLM model config file to read", ) + parser.add_argument( + "--word-dict", type=str, default=None, help="Word list to read") + parser.add_argument( + "--lm-weight", type=float, default=0.1, help="RNNLM weight") + # ngram related + parser.add_argument( + "--ngram-model", + type=str, + default=None, + help="ngram model file to read") + parser.add_argument( + "--ngram-weight", type=float, default=0.1, help="ngram weight") + parser.add_argument( + "--ngram-scorer", + type=str, + default="part", + choices=("full", "part"), + help="""if the ngram is set as a part scorer, similar with CTC scorer, + ngram scorer only scores topK hypethesis. 
+        If the ngram is set as a full scorer, it scores all hypotheses.
+        The decoding speed of the part scorer is much faster than the full one""",
+    )
+    # streaming related
+    parser.add_argument(
+        "--streaming-mode",
+        type=str,
+        default=None,
+        choices=["window", "segment"],
+        help="""Use streaming recognizer for inference.
+        `--batchsize` must be set to 0 to enable this mode""", )
+    parser.add_argument(
+        "--streaming-window", type=int, default=10, help="Window size")
+    parser.add_argument(
+        "--streaming-min-blank-dur",
+        type=int,
+        default=10,
+        help="Minimum blank duration threshold", )
+    parser.add_argument(
+        "--streaming-onset-margin", type=int, default=1, help="Onset margin")
+    parser.add_argument(
+        "--streaming-offset-margin", type=int, default=1, help="Offset margin")
+    # non-autoregressive related
+    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for details.
+    parser.add_argument(
+        "--maskctc-n-iterations",
+        type=int,
+        default=10,
+        help="Number of decoding iterations."
+        "For Mask CTC, set 0 to predict 1 mask/iter.", )
+    parser.add_argument(
+        "--maskctc-probability-threshold",
+        type=float,
+        default=0.999,
+        help="Threshold probability for CTC output", )
+    # quantize model related
+    parser.add_argument(
+        "--quantize-config",
+        nargs="*",
+        help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]",
+    )
+    parser.add_argument(
+        "--quantize-dtype",
+        type=str,
+        default="qint8",
+        help="Dtype for dynamic quantization")
+    parser.add_argument(
+        "--quantize-asr-model",
+        type=bool,
+        default=False,
+        help="Quantize ASR model", )
+    parser.add_argument(
+        "--quantize-lm-model",
+        type=bool,
+        default=False,
+        help="Quantize LM model", )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function."""
+    parser = get_parser()
+    parser.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    parser.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    parser.add_argument("--dict-path", type=str, help="path to load dict")
+    args = parser.parse_args(args)
+
+    if args.ngpu == 0 and args.dtype == "float16":
+        raise ValueError(
+            f"--dtype {args.dtype} does not support the CPU backend.")
+
+    # logging info
+    if args.verbose == 1:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose == 2:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+    logging.info(args)
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+    # TODO(mn5k): support of multiple GPUs
+    if args.ngpu > 1:
+        logging.error("The program only supports ngpu=1.")
+        sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # validate rnn options
+    if args.rnnlm is not None and args.word_rnnlm is not None:
+        logging.error(
+            "It seems that both --rnnlm and --word-rnnlm are specified. "
+            "Please use either option.")
+        sys.exit(1)
+
+    # recog
+    if args.num_spkrs == 1:
+        if args.num_encs == 1:
+            # Experimental API that supports custom LMs
+            if args.api == "v2":
+                from deepspeech.decoders.recog import recog_v2
+                recog_v2(args)
+            else:
+                raise ValueError("Only support --api v2")
+        else:
+            if args.api == "v2":
+                raise NotImplementedError(
+                    f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
+                )
+    elif args.num_spkrs == 2:
+        raise ValueError("asr_mix not supported.")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/deepspeech/decoders/scorers/ngram.py b/deepspeech/decoders/scorers/ngram.py
index 050a8c81f..a34d82483 100644
--- a/deepspeech/decoders/scorers/ngram.py
+++ b/deepspeech/decoders/scorers/ngram.py
@@ -85,8 +85,9 @@ class NgramFullScorer(Ngrambase, BatchScorerInterface):
             and next state list for ys.
 
         """
-        return self.score_partial_(
-            y, paddle.to_tensor(range(self.charlen)), state, x)
+        return self.score_partial_(y,
+                                   paddle.to_tensor(range(self.charlen)), state,
+                                   x)
 
 
 class NgramPartScorer(Ngrambase, PartialScorerInterface):
diff --git a/deepspeech/decoders/utils.py b/deepspeech/decoders/utils.py
index f59b55d9b..3ed9c5da5 100644
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import numpy as np
+
 from deepspeech.utils.log import Log
 
 logger = Log(__name__).getlog()
@@ -98,7 +98,8 @@ def add_results_to_json(js, nbest_hyps, char_list):
 
     for n, hyp in enumerate(nbest_hyps, 1):
         # parse hypothesis
-        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list)
+        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp,
+                                                                   char_list)
 
         # copy ground-truth
         if len(js["output"]) > 0:
@@ -125,4 +126,4 @@ def add_results_to_json(js, nbest_hyps, char_list):
     logger.info("groundtruth: %s" % out_dic["text"])
     logger.info("prediction : %s" % out_dic["rec_text"])
 
-    return new_js
\ No newline at end of file
+    return new_js
diff --git a/deepspeech/exps/__init__.py b/deepspeech/exps/__init__.py
index 185a92b8d..299530146 100644
--- a/deepspeech/exps/__init__.py
+++ b/deepspeech/exps/__init__.py
@@ -11,3 +11,52 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from deepspeech.training.trainer import Trainer
+from deepspeech.utils.dynamic_import import dynamic_import
+
+model_trainer_alias = {
+    "ds2": "deepspeech.exps.deepspeech2.model:DeepSpeech2Trainer",
+    "u2": "deepspeech.exps.u2.model:U2Trainer",
+    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Trainer",
+    "u2_st": "deepspeech.exps.u2_st.model:U2STTrainer",
+}
+
+
+def dynamic_import_trainer(module):
+    """Import Trainer dynamically.
+
+    Args:
+        module (str): trainer name. e.g., ds2, u2, u2_kaldi
+
+    Returns:
+        type: Trainer class
+
+    """
+    model_class = dynamic_import(module, model_trainer_alias)
+    assert issubclass(model_class,
+                      Trainer), f"{module} does not implement Trainer"
+    return model_class
+
+
+model_tester_alias = {
+    "ds2": "deepspeech.exps.deepspeech2.model:DeepSpeech2Tester",
+    "u2": "deepspeech.exps.u2.model:U2Tester",
+    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
+    "u2_st": "deepspeech.exps.u2_st.model:U2STTester",
+}
+
+
+def dynamic_import_tester(module):
+    """Import Tester dynamically.
+ + Args: + module (str): tester name. e.g., ds2, u2, u2_kaldi + + Returns: + type: Tester class + + """ + model_class = dynamic_import(module, model_tester_alias) + assert issubclass(model_class, + Trainer), f"{module} does not implement Tester" + return model_class diff --git a/deepspeech/exps/u2_kaldi/bin/recog.py b/deepspeech/exps/u2_kaldi/bin/recog.py index 60f4e58ac..e94a1ab18 100644 --- a/deepspeech/exps/u2_kaldi/bin/recog.py +++ b/deepspeech/exps/u2_kaldi/bin/recog.py @@ -1,379 +1,19 @@ - -"""End-to-end speech recognition model decoding script.""" - -import configargparse -import logging -import os -import random +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import sys -import numpy as np - -from distutils.util import strtobool -from deepspeech.training.cli import default_argument_parser - -# NOTE: you need this func to generate our sphinx doc - -def get_parser(): - """Get default arguments.""" - parser = configargparse.ArgumentParser( - description="Transcribe text from speech using " - "a speech recognition model on one CPU or GPU", - config_file_parser_class=configargparse.YAMLConfigFileParser, - formatter_class=configargparse.ArgumentDefaultsHelpFormatter, - ) - parser.add( - '--model-name', - type=str, - default='u2_kaldi', - help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st') - # general configuration - parser.add("--config", is_config_file=True, help="Config file path") - parser.add( - "--config2", - is_config_file=True, - help="Second config file path that overwrites the settings in `--config`", - ) - parser.add( - "--config3", - is_config_file=True, - help="Third config file path that overwrites the settings " - "in `--config` and `--config2`", - ) - - parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs") - parser.add_argument( - "--dtype", - choices=("float16", "float32", "float64"), - default="float32", - help="Float precision (only available in --api v2)", - ) - parser.add_argument("--debugmode", type=int, default=1, help="Debugmode") - parser.add_argument("--seed", type=int, default=1, help="Random seed") - parser.add_argument("--verbose", "-V", type=int, default=2, help="Verbose option") - parser.add_argument( - "--batchsize", - type=int, - default=1, - help="Batch size for beam search (0: means no batch processing)", - ) - parser.add_argument( - "--preprocess-conf", - type=str, - default=None, - help="The configuration file for the pre-processing", - ) - parser.add_argument( - "--api", - default="v2", - choices=["v2"], - help="Beam search APIs " - "v2: Experimental API. 
It supports any models that implements ScorerInterface.", - ) - # task related - parser.add_argument( - "--recog-json", type=str, help="Filename of recognition data (json)" - ) - parser.add_argument( - "--result-label", - type=str, - required=True, - help="Filename of result label data (json)", - ) - # model (parameter) related - parser.add_argument( - "--model", type=str, required=True, help="Model file parameters to read" - ) - parser.add_argument( - "--model-conf", type=str, default=None, help="Model config file" - ) - parser.add_argument( - "--num-spkrs", - type=int, - default=1, - choices=[1, 2], - help="Number of speakers in the speech", - ) - parser.add_argument( - "--num-encs", default=1, type=int, help="Number of encoders in the model." - ) - # search related - parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - parser.add_argument("--beam-size", type=int, default=1, help="Beam size") - parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty") - parser.add_argument( - "--maxlenratio", - type=float, - default=0.0, - help="""Input length ratio to obtain max output length. - If maxlenratio=0.0 (default), it uses a end-detect function - to automatically find maximum hypothesis lengths. - If maxlenratio<0.0, its absolute value is interpreted - as a constant max output length""", - ) - parser.add_argument( - "--minlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain min output length", - ) - parser.add_argument( - "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding" - ) - parser.add_argument( - "--weights-ctc-dec", - type=float, - action="append", - help="ctc weight assigned to each encoder during decoding." - "[in multi-encoder mode only]", - ) - parser.add_argument( - "--ctc-window-margin", - type=int, - default=0, - help="""Use CTC window with margin parameter to accelerate - CTC/attention decoding especially on GPU. Smaller magin - makes decoding faster, but may increase search errors. - If margin=0 (default), this function is disabled""", - ) - # transducer related - parser.add_argument( - "--search-type", - type=str, - default="default", - choices=["default", "nsc", "tsd", "alsd", "maes"], - help="""Type of beam search implementation to use during inference. 
- Can be either: default beam search ("default"), - N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"), - Alignment-Length Synchronous Decoding ("alsd") or - modified Adaptive Expansion Search ("maes").""", - ) - parser.add_argument( - "--nstep", - type=int, - default=1, - help="""Number of expansion steps allowed in NSC beam search or mAES - (nstep > 0 for NSC and nstep > 1 for mAES).""", - ) - parser.add_argument( - "--prefix-alpha", - type=int, - default=2, - help="Length prefix difference allowed in NSC beam search or mAES.", - ) - parser.add_argument( - "--max-sym-exp", - type=int, - default=2, - help="Number of symbol expansions allowed in TSD.", - ) - parser.add_argument( - "--u-max", - type=int, - default=400, - help="Length prefix difference allowed in ALSD.", - ) - parser.add_argument( - "--expansion-gamma", - type=float, - default=2.3, - help="Allowed logp difference for prune-by-value method in mAES.", - ) - parser.add_argument( - "--expansion-beta", - type=int, - default=2, - help="""Number of additional candidates for expanded hypotheses - selection in mAES.""", - ) - parser.add_argument( - "--score-norm", - type=strtobool, - nargs="?", - default=True, - help="Normalize final hypotheses' score by length", - ) - parser.add_argument( - "--softmax-temperature", - type=float, - default=1.0, - help="Penalization term for softmax function.", - ) - # rnnlm related - parser.add_argument( - "--rnnlm", type=str, default=None, help="RNNLM model file to read" - ) - parser.add_argument( - "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read" - ) - parser.add_argument( - "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read" - ) - parser.add_argument( - "--word-rnnlm-conf", - type=str, - default=None, - help="Word RNNLM model config file to read", - ) - parser.add_argument("--word-dict", type=str, default=None, help="Word list to read") - parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight") - # ngram related - parser.add_argument( - "--ngram-model", type=str, default=None, help="ngram model file to read" - ) - parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight") - parser.add_argument( - "--ngram-scorer", - type=str, - default="part", - choices=("full", "part"), - help="""if the ngram is set as a part scorer, similar with CTC scorer, - ngram scorer only scores topK hypethesis. - if the ngram is set as full scorer, ngram scorer scores all hypthesis - the decoding speed of part scorer is musch faster than full one""", - ) - # streaming related - parser.add_argument( - "--streaming-mode", - type=str, - default=None, - choices=["window", "segment"], - help="""Use streaming recognizer for inference. - `--batchsize` must be set to 0 to enable this mode""", - ) - parser.add_argument("--streaming-window", type=int, default=10, help="Window size") - parser.add_argument( - "--streaming-min-blank-dur", - type=int, - default=10, - help="Minimum blank duration threshold", - ) - parser.add_argument( - "--streaming-onset-margin", type=int, default=1, help="Onset margin" - ) - parser.add_argument( - "--streaming-offset-margin", type=int, default=1, help="Offset margin" - ) - # non-autoregressive related - # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail. - parser.add_argument( - "--maskctc-n-iterations", - type=int, - default=10, - help="Number of decoding iterations." 
- "For Mask CTC, set 0 to predict 1 mask/iter.", - ) - parser.add_argument( - "--maskctc-probability-threshold", - type=float, - default=0.999, - help="Threshold probability for CTC output", - ) - # quantize model related - parser.add_argument( - "--quantize-config", - nargs="*", - help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]", - ) - parser.add_argument( - "--quantize-dtype", type=str, default="qint8", help="Dtype dynamic quantize" - ) - parser.add_argument( - "--quantize-asr-model", - type=bool, - default=False, - help="Quantize asr model", - ) - parser.add_argument( - "--quantize-lm-model", - type=bool, - default=False, - help="Quantize lm model", - ) - return parser - - -def main(args): - """Run the main decoding function.""" - parser = get_parser() - parser.add_argument( - "--output", metavar="CKPT_DIR", help="path to save checkpoint.") - parser.add_argument( - "--checkpoint_path", type=str, help="path to load checkpoint") - parser.add_argument( - "--dict-path", type=str, help="path to load checkpoint") - # parser = default_argument_parser(parser) - args = parser.parse_args(args) - - if args.ngpu == 0 and args.dtype == "float16": - raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.") - - # logging info - if args.verbose == 1: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - elif args.verbose == 2: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - else: - logging.basicConfig( - level=logging.WARN, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - logging.warning("Skip DEBUG/INFO messages") - logging.info(args) - - # check CUDA_VISIBLE_DEVICES - if args.ngpu > 0: - cvd = os.environ.get("CUDA_VISIBLE_DEVICES") - if cvd is None: - logging.warning("CUDA_VISIBLE_DEVICES is not set.") - elif args.ngpu != len(cvd.split(",")): - logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.") - sys.exit(1) - - # TODO(mn5k): support of multiple GPUs - if args.ngpu > 1: - logging.error("The program only supports ngpu=1.") - sys.exit(1) - - # display PYTHONPATH - logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)")) - - # seed setting - random.seed(args.seed) - np.random.seed(args.seed) - logging.info("set random seed = %d" % args.seed) - - # validate rnn options - if args.rnnlm is not None and args.word_rnnlm is not None: - logging.error( - "It seems that both --rnnlm and --word-rnnlm are specified. " - "Please use either option." 
- ) - sys.exit(1) - - # recog - if args.num_spkrs == 1: - if args.num_encs == 1: - # Experimental API that supports custom LMs - if args.api == "v2": - from deepspeech.decoders.recog import recog_v2 - recog_v2(args) - else: - raise ValueError("Only support --api v2") - else: - if args.api == "v2": - raise NotImplementedError( - f"--num-encs {args.num_encs} > 1 is not supported in --api v2" - ) - elif args.num_spkrs == 2: - raise ValueError("asr_mix not supported.") - +from deepspeech.decoders.recog_bin import main if __name__ == "__main__": main(sys.argv[1:]) diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 3aadca856..f86243269 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -434,8 +434,9 @@ class U2Tester(U2Trainer): simulate_streaming=cfg.simulate_streaming) decode_time = time.time() - start_time - for i, (utt, target, result, rec_tids) in enumerate(zip( - utts, target_transcripts, result_transcripts, result_tokenids)): + for i, (utt, target, result, rec_tids) in enumerate( + zip(utts, target_transcripts, result_transcripts, + result_tokenids)): errors, len_ref = errors_func(target, result) errors_sum += errors len_refs += len_ref diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index 18ff411b0..a6834ebc6 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -140,7 +140,7 @@ class TextFeaturizer(): Returns: str: text string. """ - tokens = [t.replace(SPACE, " ") for t in tokens ] + tokens = [t.replace(SPACE, " ") for t in tokens] return "".join(tokens) def word_tokenize(self, text): diff --git a/deepspeech/models/asr_interface.py b/deepspeech/models/asr_interface.py index eb820fc05..7dac81b4f 100644 --- a/deepspeech/models/asr_interface.py +++ b/deepspeech/models/asr_interface.py @@ -1,3 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ASR Interface module.""" import argparse @@ -72,7 +85,8 @@ class ASRInterface: :return: attention weights (B, Lmax, Tmax) :rtype: float ndarray """ - raise NotImplementedError("calculate_all_attentions method is not implemented") + raise NotImplementedError( + "calculate_all_attentions method is not implemented") def calculate_all_ctc_probs(self, xs, ilens, ys): """Calculate CTC probability. 
@@ -83,7 +97,8 @@ class ASRInterface: :return: CTC probabilities (B, Tmax, vocab) :rtype: float ndarray """ - raise NotImplementedError("calculate_all_ctc_probs method is not implemented") + raise NotImplementedError( + "calculate_all_ctc_probs method is not implemented") @property def attention_plot_class(self): @@ -102,8 +117,7 @@ class ASRInterface: def get_total_subsampling_factor(self): """Get total subsampling factor.""" raise NotImplementedError( - "get_total_subsampling_factor method is not implemented" - ) + "get_total_subsampling_factor method is not implemented") def encode(self, feat): """Encode feature in `beam_search` (optional). @@ -126,23 +140,22 @@ class ASRInterface: predefined_asr = { - "transformer": "deepspeech.models.u2:E2E", - "conformer": "deepspeech.models.u2:E2E", + "transformer": "deepspeech.models.u2:U2Model", + "conformer": "deepspeech.models.u2:U2Model", } -def dynamic_import_asr(module, name): + +def dynamic_import_asr(module): """Import ASR models dynamically. Args: - module (str): module_name:class_name or alias in `predefined_asr` - name (str): asr name. e.g., transformer, conformer + module (str): asr name. e.g., transformer, conformer Returns: type: ASR class """ - model_class = dynamic_import(module, predefined_asr.get(name, "")) - assert issubclass( - model_class, ASRInterface - ), f"{module} does not implement ASRInterface" + model_class = dynamic_import(module, predefined_asr) + assert issubclass(model_class, + ASRInterface), f"{module} does not implement ASRInterface" return model_class diff --git a/deepspeech/models/u2/u2.py b/deepspeech/models/u2/u2.py index fa517906f..6cd3b7751 100644 --- a/deepspeech/models/u2/u2.py +++ b/deepspeech/models/u2/u2.py @@ -28,8 +28,10 @@ from paddle import jit from paddle import nn from yacs.config import CfgNode +from deepspeech.decoders.scorers.ctc import CTCPrefixScorer from deepspeech.frontend.utility import IGNORE_ID from deepspeech.frontend.utility import load_cmvn +from deepspeech.models.asr_interface import ASRInterface from deepspeech.modules.cmvn import GlobalCMVN from deepspeech.modules.ctc import CTCDecoder from deepspeech.modules.decoder import TransformerDecoder @@ -49,8 +51,6 @@ from deepspeech.utils.tensor_utils import pad_sequence from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.utility import log_add from deepspeech.utils.utility import UpdateConfig -from deepspeech.models.asr_interface import ASRInterface -from deepspeech.decoders.scorers.ctc import CTCPrefixScorer __all__ = ["U2Model", "U2InferModel"] @@ -816,10 +816,10 @@ class U2BaseModel(ASRInterface, nn.Layer): class U2DecodeModel(U2BaseModel): - def scorers(self): """Scorers.""" - return dict(decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos)) + return dict( + decoder=self.decoder, ctc=CTCPrefixScorer(self.ctc, self.eos)) def encode(self, x): """Encode acoustic features. diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py index ee3572ea9..735f06dc6 100644 --- a/deepspeech/modules/decoder.py +++ b/deepspeech/modules/decoder.py @@ -12,23 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Decoder definition.""" +from typing import Any from typing import List from typing import Optional from typing import Tuple -from typing import Any import paddle from paddle import nn from typeguard import check_argument_types +from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface from deepspeech.modules.attention import MultiHeadedAttention from deepspeech.modules.decoder_layer import DecoderLayer from deepspeech.modules.embedding import PositionalEncoding from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.modules.mask import subsequent_mask from deepspeech.modules.mask import make_xs_mask +from deepspeech.modules.mask import subsequent_mask from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward -from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface from deepspeech.utils.log import Log logger = Log(__name__).getlog() @@ -191,8 +191,8 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ys: (ylen,) x: (xlen, n_feat) """ - ys_mask = subsequent_mask(len(ys)).unsqueeze(0) # (B,L,L) - x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1) # (B,1,T) + ys_mask = subsequent_mask(len(ys)).unsqueeze(0) # (B,L,L) + x_mask = make_xs_mask(x.unsqueeze(0)).unsqueeze(1) # (B,1,T) if self.selfattention_layer_type != "selfattn": # TODO(karita): implement cache logging.warning( @@ -200,16 +200,14 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ) state = None logp, state = self.forward_one_step( - x.unsqueeze(0), x_mask, - ys.unsqueeze(0), ys_mask, - cache=state - ) + x.unsqueeze(0), x_mask, ys.unsqueeze(0), ys_mask, cache=state) return logp.squeeze(0), state # batch beam search API (see BatchScorerInterface) - def batch_score( - self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor - ) -> Tuple[paddle.Tensor, List[Any]]: + def batch_score(self, + ys: paddle.Tensor, + states: List[Any], + xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]: """Score new token batch (required). Args: @@ -237,10 +235,12 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L) - xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) - logp, states = self.forward_one_step(xs, xs_mask, ys, ys_mask, cache=batch_state) + ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L) + xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) + logp, states = self.forward_one_step( + xs, xs_mask, ys, ys_mask, cache=batch_state) # transpose state of [layer, batch] into [batch, layer] - state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)] + state_list = [[states[i][b] for i in range(n_layers)] + for b in range(n_batch)] return logp, state_list diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py index 7ae418c03..52f8e4bca 100644 --- a/deepspeech/modules/mask.py +++ b/deepspeech/modules/mask.py @@ -24,7 +24,7 @@ __all__ = [ ] -def make_xs_mask(xs:paddle.Tensor, pad_value=0.0) -> paddle.Tensor: +def make_xs_mask(xs: paddle.Tensor, pad_value=0.0) -> paddle.Tensor: """Maks mask tensor containing indices of non-padded part. Args: xs (paddle.Tensor): (B, T, D), zeros for pad. 
diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py
index db7076d3d..14a34cb75 100644
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@@ -64,7 +64,7 @@ def default_argument_parser(parser=None):
     """
     if parser is None:
         parser = argparse.ArgumentParser()
-    
+
    parser.register('action', 'extend', ExtendAction)
    parser.add_argument(
        '--conf', type=open, action=LoadFromFile, help="config file.")
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index b2ee7a1b8..2c2389203 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -126,7 +126,8 @@ class Trainer():
         logger.info(f"Set seed {args.seed}")
 
         # profiler and benchmark options
-        if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size:
+        if hasattr(self.args,
+                   "benchmark_batch_size") and self.args.benchmark_batch_size:
             with UpdateConfig(self.config):
                 self.config.collator.batch_size = self.args.benchmark_batch_size
                 self.config.training.log_interval = 1
@@ -335,8 +336,7 @@ class Trainer():
         """
         assert self.args.checkpoint_path
         infos = self.checkpoint.load_latest_parameters(
-            self.model,
-            checkpoint_path=self.args.checkpoint_path)
+            self.model, checkpoint_path=self.args.checkpoint_path)
         return infos
 
     def run_test(self):
diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 724590952..5943cf1d7 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -1,8 +1,8 @@
 # ASR
-* s0 is for deepspeech2 offline 
-* s1 is for transformer/conformer/U2 
-* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi 
+* s0 is for deepspeech2 offline
+* s1 is for transformer/conformer/U2
+* s2 is for transformer/conformer/U2 w/ kaldi feat; requires Kaldi to be installed
 
 ## Data
 | Data Subset | Duration in Seconds |
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md
index 6e41cc370..d5df37d84 100644
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@@ -1,14 +1,14 @@
 # LibriSpeech
 | Model | Params | Config | Augmentation| Loss |
-| --- | --- | --- | --- |
+| --- | --- | --- | --- | --- |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug | 6.3197922706604 |
 
-| Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err | 
+| Test Set | Decode Method | #Snt | #Wrd | Corr | Sub | Del | Ins | Err | S.Err |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| test-clean | attention | 2620 | 52576 | 96.4 | 2.5 | 1.1 | 0.4 | 4.0 | 34.7 | 
-| test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 | 
-| test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 | 
-| test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 | 
-| test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 | 
+| test-clean | attention | 2620 | 52576 | 96.4 | 2.5 | 1.1 | 0.4 | 4.0 | 34.7 |
+| test-clean | ctc_greedy_search | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 48.0 |
+| test-clean | ctc_prefix_beamsearch | 2620 | 52576 | 95.9 | 3.7 | 0.4 | 0.5 | 4.6 | 47.6 |
+| test-clean | attention_rescore | 2620 | 52576 | 96.8 | 2.9 | 0.3 | 0.4 | 3.7 | 38.0 |
+| test-clean | join_ctc_w/o_lm | 2620 | 52576 | 97.2 | 2.6 | 0.3 | 0.4 | 3.2 | 34.9 |
diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/s2/conf/decode/decode.yaml
index 867bf6118..4c702db56 100644
--- a/examples/librispeech/s2/conf/decode/decode.yaml
+++ 
b/examples/librispeech/s2/conf/decode/decode.yaml @@ -1,6 +1,6 @@ batchsize: 0 beam-size: 60 -ctc-weight: 0.4 +ctc-weight: 0.0 lm-weight: 0.0 maxlenratio: 0.0 minlenratio: 0.0 diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/s2/local/recog.sh index 0393535b4..df3846c02 100755 --- a/examples/librispeech/s2/local/recog.sh +++ b/examples/librispeech/s2/local/recog.sh @@ -5,11 +5,14 @@ set -e expdir=exp datadir=data nj=32 +tag= +# decode config decode_config=conf/decode/decode.yaml + +# lm params lang_model=rnnlm.model.best lmexpdir=exp/train_rnnlm_pytorch_lm_transformer_cosine_batchsize32_lr1e-4_layer16_unigram5000_ngpu4/ - lmtag='nolm' recog_set="test-clean test-other dev-clean dev-other" @@ -21,18 +24,21 @@ bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" bpemodel=${bpeprefix}.model -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" - exit -1 +# bin params +config_path=conf/transformer.yaml +dict=data/bpe_unigram_5000_units.txt +ckpt_prefix= + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +if [ -z ${ckpt_prefix} ]; then + echo "usage: $0 --ckpt_prefix ckpt_prefix" + exit 1 fi ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -config_path=$1 -dict=$2 -ckpt_prefix=$3 - ckpt_dir=$(dirname `dirname ${ckpt_prefix}`) echo "ckpt dir: ${ckpt_dir}" @@ -61,7 +67,7 @@ for dmethd in join_ctc; do for rtask in ${recog_set}; do ( echo "${rtask} dataset" - decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag} + decode_dir=${ckpt_dir}/decode/decode_${rtask/-/_}_${dmethd}_$(basename ${config_path%.*})_${lmtag}_${ckpt_tag}_${tag} feat_recog_dir=${datadir} mkdir -p ${decode_dir} mkdir -p ${feat_recog_dir} diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh index 006a13c5f..5f662d292 100755 --- a/examples/librispeech/s2/local/test.sh +++ b/examples/librispeech/s2/local/test.sh @@ -17,19 +17,20 @@ bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" bpemodel=${bpeprefix}.model -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" - exit -1 +config_path=conf/transformer.yaml +dict=data/bpe_unigram_5000_units.txt +ckpt_prefix= + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +if [ -z ${ckpt_prefix} ]; then + echo "usage: $0 --ckpt_prefix ckpt_prefix" + exit 1 fi ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
-config_path=$1 -dict=$2 -ckpt_prefix=$3 - - ckpt_dir=$(dirname `dirname ${ckpt_prefix}`) echo "ckpt dir: ${ckpt_dir}" diff --git a/requirements.txt b/requirements.txt index 42fd645a5..a7310a024 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,43 +1,43 @@ +ConfigArgParse coverage editdistance +g2p_en +g2pM gpustat +h5py +inflect +jieba jsonlines kaldiio +librosa +llvmlite loguru +matplotlib +nltk +numba +numpy==1.20.0 +pandas +phkit Pillow +praatio~=4.1 pre-commit pybind11 +pypinyin +pyworld resampy==0.2.2 sacrebleu scipy==1.2.1 sentencepiece snakeviz +soundfile~=0.10 sox tensorboardX textgrid +timer tqdm typeguard -visualdl==2.2.0 -yacs -numpy==1.20.0 -numba -nltk -inflect -librosa unidecode -llvmlite -matplotlib -pandas -soundfile~=0.10 -g2p_en -pypinyin +visualdl==2.2.0 webrtcvad -g2pM -praatio~=4.1 -h5py -timer -pyworld -jieba -phkit +yacs yq -ConfigArgParse \ No newline at end of file diff --git a/setup.py b/setup.py index 07b6aac04..bd982129c 100644 --- a/setup.py +++ b/setup.py @@ -11,20 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import contextlib +import inspect import io import os import re +import subprocess as sp import sys from pathlib import Path -import contextlib -import inspect +from setuptools import Command from setuptools import find_packages from setuptools import setup -from setuptools import Command from setuptools.command.develop import develop from setuptools.command.install import install -import subprocess as sp HERE = Path(os.path.abspath(os.path.dirname(__file__))) @@ -40,16 +40,18 @@ def pushd(new_dir): def read(*names, **kwargs): - with io.open(os.path.join(os.path.dirname(__file__), *names), - encoding=kwargs.get("encoding", "utf8")) as fp: + with io.open( + os.path.join(os.path.dirname(__file__), *names), + encoding=kwargs.get("encoding", "utf8")) as fp: return fp.read() def check_call(cmd: str, shell=False, executable=None): try: - sp.check_call(cmd.split(), - shell=shell, - executable="/bin/bash" if shell else executable) + sp.check_call( + cmd.split(), + shell=shell, + executable="/bin/bash" if shell else executable) except sp.CalledProcessError as e: print( f"{__file__}:{inspect.currentframe().f_lineno}: CMD: {cmd}, Error:", @@ -189,7 +191,6 @@ setup_info = dict( 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', - ], -) + ], ) setup(**setup_info)
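
Note on the dynamic-import helpers added in deepspeech/exps/__init__.py: each
alias table maps a short model name to a "package.module:Class" string that
dynamic_import resolves to the class object. A minimal usage sketch, assuming
the aliased modules are importable (the variable names here are illustrative,
not taken from the repo):

    from deepspeech.exps import dynamic_import_tester, dynamic_import_trainer

    # "u2" resolves via model_trainer_alias to deepspeech.exps.u2.model:U2Trainer
    trainer_cls = dynamic_import_trainer("u2")
    # "u2_kaldi" resolves via model_tester_alias to deepspeech.exps.u2_kaldi.model:U2Tester
    tester_cls = dynamic_import_tester("u2_kaldi")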
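
Note on --ngram-scorer in recog_bin.py: a "part" scorer rescores only the top-K
candidate tokens proposed by the other scorers, while a "full" scorer scores
the entire vocabulary at every step, which is why the part scorer decodes much
faster. A hedged sketch of the selection, using the NgramFullScorer and
NgramPartScorer classes from deepspeech/decoders/scorers/ngram.py (the
constructor arguments below are assumptions, not the repo's exact signature):

    from deepspeech.decoders.scorers.ngram import NgramFullScorer
    from deepspeech.decoders.scorers.ngram import NgramPartScorer

    def build_ngram_scorer(scorer_type, ngram_model_path, token_list):
        # Illustrative wiring only; the argument names are hypothetical.
        if scorer_type == "full":
            # scores every vocabulary symbol per hypothesis (slower, exact)
            return NgramFullScorer(ngram_model_path, token_list)
        # rescores only the top-K partial candidates (much faster)
        return NgramPartScorer(ngram_model_path, token_list)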
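
One caveat in get_parser(): --quantize-asr-model and --quantize-lm-model use
type=bool, and argparse applies plain truthiness to the string value, so even
"--quantize-asr-model False" parses as True. The strtobool pattern that the
deleted recog.py already used for --score-norm avoids this; a sketch of the
safer definition (not part of the patch series):

    import argparse
    from distutils.util import strtobool

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--quantize-asr-model",
        type=strtobool,   # parses "true"/"false"/"1"/"0" correctly
        nargs="?",
        const=True,       # a bare --quantize-asr-model still enables it
        default=False,
        help="Quantize asr model", )
    args = parser.parse_args(["--quantize-asr-model", "false"])
    assert args.quantize_asr_model == 0  # correctly falsy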