From e4a9328c4036b2adab1dd330613d47a0a0dd872d Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Oct 2021 05:20:59 +0000
Subject: [PATCH 1/2] fix some bug and complete the recog.py

---
 deepspeech/decoders/recog.py        | 19 +++++++++++++------
 deepspeech/models/lm/transformer.py | 22 ++++++++++++----------
 deepspeech/modules/encoder.py       |  2 +-
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py
index c8df65d68..450aaa5d6 100644
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@@ -29,6 +29,7 @@ from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
 from deepspeech.utils.log import Log
+from deepspeech.models.lm.transformer import TransformerLM
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
@@ -83,12 +84,18 @@ def recog_v2(args):
     )
 
     if args.rnnlm:
-        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
-        # NOTE: for a compatibility with less than 0.5.0 version models
-        lm_model_module = getattr(lm_args, "model_module", "default")
-        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
-        lm = lm_class(len(char_list), lm_args)
-        torch_load(args.rnnlm, lm)
+        lm_path = args.rnnlm
+        lm = TransformerLM(
+            n_vocab=5002,
+            pos_enc=None,
+            embed_unit=128,
+            att_unit=512,
+            head=8,
+            unit=2048,
+            layer=16,
+            dropout_rate=0.5, )
+        model_dict = paddle.load(lm_path)
+        lm.set_state_dict(model_dict)
         lm.eval()
     else:
         lm = None
diff --git a/deepspeech/models/lm/transformer.py b/deepspeech/models/lm/transformer.py
index dcae4ea0f..a6b5811ca 100644
--- a/deepspeech/models/lm/transformer.py
+++ b/deepspeech/models/lm/transformer.py
@@ -23,9 +23,9 @@ import paddle.nn.functional as F
 from deepspeech.modules.mask import subsequent_mask
 from deepspeech.modules.encoder import TransformerEncoder
 from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
-from deepspeech.models.lm_interface import
-#LMInterface
+from deepspeech.models.lm_interface import LMInterface
 
+import logging
 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def __init__(
         self,
@@ -36,7 +36,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         head: int=2,
         unit: int=1024,
         layer: int=4,
-        dropout_rate: float=0.5, 
+        dropout_rate: float=0.5,
         emb_dropout_rate: float = 0.0,
         att_dropout_rate: float = 0.0,
         tie_weights: bool = False,):
@@ -84,6 +84,8 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         ), "Tie Weights: True need embedding and final dimensions to match"
         self.decoder.weight = self.embed.weight
 
+
+
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
         m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
@@ -151,7 +153,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             emb, self._target_mask(y), cache=state
         )
         h = self.decoder(h[:, -1])
-        logp = h.log_softmax(axis=-1).squeeze(0)
+        logp = F.log_softmax(h).squeeze(0)
         return logp, cache
 
     # batch beam search API (see BatchScorerInterface)
@@ -194,7 +196,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             emb, self._target_mask(ys), cache=batch_state
         )
         h = self.decoder(h[:, -1])
-        logp = h.log_softmax(axi=-1)
+        logp = F.log_softmax(h)
 
         # transpose state of [layer, batch] into [batch, layer]
         state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
@@ -219,7 +221,7 @@ if __name__ == "__main__":
    #     head: int=2,
    #     unit: int=1024,
    #     layer: int=4,
-   #     dropout_rate: float=0.5, 
+   #     dropout_rate: float=0.5,
    #     emb_dropout_rate: float = 0.0,
    #     att_dropout_rate: float = 0.0,
    #     tie_weights: bool = False,):
@@ -231,14 +233,14 @@ if __name__ == "__main__":
     #Test the score
     input2 = np.array([5])
     input2 = paddle.to_tensor(input2)
-    state = (None, None, 0)
+    state = None
     output, state = tlm.score(input2, state, None)
 
-    input3 = np.array([10])
+    input3 = np.array([5,10])
     input3 = paddle.to_tensor(input3)
     output, state = tlm.score(input3, state, None)
 
-    input4 = np.array([0])
+    input4 = np.array([5,10,0])
     input4 = paddle.to_tensor(input4)
     output, state = tlm.score(input4, state, None)
     print("output", output)
@@ -256,4 +258,4 @@ if __name__ == "__main__":
     print("output", output)
     #print("cache", cache)
     #np.save("output_pd.npy", output)
-    """
\ No newline at end of file
+    """
diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py
index 0f8f10751..518f2bbb6 100644
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@@ -399,7 +399,7 @@ class TransformerEncoder(BaseEncoder):
             #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
             xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         else:
-            xs = self.embed(xs)
+            xs , pos_emb, masks= self.embed(xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
 
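Note: Patch 1/2 swaps the ESPnet-style `get_model_conf`/`torch_load` loading path for a `TransformerLM` constructed directly in Python and restored with `paddle.load`, and its test block fixes the incremental `score()` contract: the cache state starts as `None` and each call rescores the full, growing prefix. A minimal sketch of driving that API, mirroring the patch's own test block (the checkpoint name `transformerLM.pdparams` is taken from that test and is a placeholder):

```python
import numpy as np
import paddle

from deepspeech.models.lm.transformer import TransformerLM

# Hyperparameters must match the checkpoint; these mirror the values
# hard-coded in recog.py above.
lm = TransformerLM(
    n_vocab=5002,
    pos_enc=None,
    embed_unit=128,
    att_unit=512,
    head=8,
    unit=2048,
    layer=16,
    dropout_rate=0.5)
lm.set_state_dict(paddle.load("transformerLM.pdparams"))  # placeholder path
lm.eval()

# Incremental scoring: the prefix grows by one token per call, and the
# per-layer attention cache is threaded through `state` (initially None).
state = None
prefix = []
for token in [5, 10, 0]:
    prefix.append(token)
    y = paddle.to_tensor(np.array(prefix, dtype=np.int64))
    logp, state = lm.score(y, state, None)  # logp: (n_vocab,) log-probabilities
```
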
From 418d85ef73865f3c30cb0e65cbadfec220befffb Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 25 Oct 2021 05:24:22 +0000
Subject: [PATCH 2/2] fix some bug and complete the recog.py

---
 deepspeech/decoders/recog.py        | 23 +++++-----
 deepspeech/models/lm/transformer.py | 65 ++++++++++++++---------------
 deepspeech/modules/encoder.py       | 11 ++---
 3 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py
index 450aaa5d6..6868bc00b 100644
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
@@ -28,8 +28,8 @@ from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.utils.log import Log
 from deepspeech.models.lm.transformer import TransformerLM
+from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
@@ -80,8 +80,7 @@ def recog_v2(args):
         sort_in_input_length=False,
         preprocess_conf=confs.collator.augmentation_config
         if args.preprocess_conf is None else args.preprocess_conf,
-        preprocess_args={"train": False},
-    )
+        preprocess_args={"train": False}, )
 
     if args.rnnlm:
         lm_path = args.rnnlm
@@ -120,8 +119,7 @@ def recog_v2(args):
         ctc=args.ctc_weight,
         lm=args.lm_weight,
         ngram=args.ngram_weight,
-        length_bonus=args.penalty,
-    )
+        length_bonus=args.penalty, )
     beam_search = BeamSearch(
         beam_size=args.beam_size,
         vocab_size=len(char_list),
@@ -130,8 +128,7 @@ def recog_v2(args):
         sos=model.sos,
         eos=model.eos,
         token_list=char_list,
-        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
-    )
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )
 
     # TODO(karita): make all scorers batchfied
     if args.batchsize == 1:
@@ -178,9 +175,10 @@ def recog_v2(args):
             logger.info(f'feat: {feat.shape}')
             enc = model.encode(paddle.to_tensor(feat).to(dtype))
             logger.info(f'eout: {enc.shape}')
-            nbest_hyps = beam_search(x=enc,
-                                     maxlenratio=args.maxlenratio,
-                                     minlenratio=args.minlenratio)
+            nbest_hyps = beam_search(
+                x=enc,
+                maxlenratio=args.maxlenratio,
+                minlenratio=args.minlenratio)
             nbest_hyps = [
                 h.asdict()
                 for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
@@ -190,9 +188,8 @@ def recog_v2(args):
 
             item = new_js[name]['output'][0]  # 1-best
             ref = item['text']
-            rec_text = item['rec_text'].replace('▁',
-                                                ' ').replace('<eos>',
-                                                             '').strip()
+            rec_text = item['rec_text'].replace('▁', ' ').replace(
+                '<eos>', '').strip()
             rec_tokenid = list(map(int, item['rec_tokenid'].split()))
             f.write({
                 "utt": name,
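Note: after these hunks, `recog_v2` builds the scorers once (decoder, CTC, the Paddle LM, length bonus) and the per-utterance loop reduces to encode-then-search. A condensed sketch of that loop; `model`, `beam_search`, `args`, `char_list`, `feat`, and `dtype` are assumed to be prepared exactly as in the diff, and the `yseq` key follows the ESPnet-style `Hypothesis.asdict()` convention this code ports:

```python
import paddle

with paddle.no_grad():
    # (T, D) encoder states for one utterance, as in the hunk above
    enc = model.encode(paddle.to_tensor(feat).to(dtype))
    nbest_hyps = beam_search(
        x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio)
    best = nbest_hyps[0].asdict()
    # token ids -> text, dropping <sos>/<eos> at the ends
    text = "".join(char_list[tid] for tid in best["yseq"][1:-1])
```
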
diff --git a/deepspeech/models/lm/transformer.py b/deepspeech/models/lm/transformer.py
index a6b5811ca..3f5a76c52 100644
--- a/deepspeech/models/lm/transformer.py
+++ b/deepspeech/models/lm/transformer.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from typing import Any
 from typing import List
 from typing import Tuple
@@ -20,12 +21,12 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
 
-from deepspeech.modules.mask import subsequent_mask
-from deepspeech.modules.encoder import TransformerEncoder
 from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.models.lm_interface import LMInterface
+from deepspeech.modules.encoder import TransformerEncoder
+from deepspeech.modules.mask import subsequent_mask
+
 
-import logging
 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def __init__(
         self,
@@ -37,9 +38,9 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         unit: int=1024,
         layer: int=4,
         dropout_rate: float=0.5,
-        emb_dropout_rate: float = 0.0,
-        att_dropout_rate: float = 0.0,
-        tie_weights: bool = False,):
+        emb_dropout_rate: float=0.0,
+        att_dropout_rate: float=0.0,
+        tie_weights: bool=False, ):
         nn.Layer.__init__(self)
 
         if pos_enc == "sinusoidal":
@@ -84,16 +85,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         ), "Tie Weights: True need embedding and final dimensions to match"
         self.decoder.weight = self.embed.weight
 
-
-
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
         m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m
 
-    def forward(
-        self, x: paddle.Tensor, t: paddle.Tensor
-    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+    def forward(self, x: paddle.Tensor, t: paddle.Tensor
+                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute LM loss value from buffer sequences.
 
         Args:
@@ -119,7 +117,8 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(x)
         h, _ = self.encoder(emb, xlen)
         y = self.decoder(h)
-        loss = F.cross_entropy(y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+        loss = F.cross_entropy(
+            y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
         mask = xm.to(dtype=loss.dtype)
         logp = loss * mask.view(-1)
         logp = logp.sum()
@@ -150,16 +149,16 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
 
         emb = self.embed(y)
         h, _, cache = self.encoder.forward_one_step(
-            emb, self._target_mask(y), cache=state
-        )
+            emb, self._target_mask(y), cache=state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h).squeeze(0)
         return logp, cache
 
     # batch beam search API (see BatchScorerInterface)
-    def batch_score(
-        self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor
-    ) -> Tuple[paddle.Tensor, List[Any]]:
+    def batch_score(self,
+                    ys: paddle.Tensor,
+                    states: List[Any],
+                    xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
         """Score new token batch (required).
 
         Args:
@@ -193,13 +192,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
 
         # batch decoding
         h, _, states = self.encoder.forward_one_step(
-            emb, self._target_mask(ys), cache=batch_state
-        )
+            emb, self._target_mask(ys), cache=batch_state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h)
 
         # transpose state of [layer, batch] into [batch, layer]
-        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
+        state_list = [[states[i][b] for i in range(n_layers)]
+                      for b in range(n_batch)]
 
         return logp, state_list
 
@@ -214,17 +213,17 @@ if __name__ == "__main__":
         layer=16,
         dropout_rate=0.5, )
 
-   #     n_vocab: int,
-   #     pos_enc: str=None,
-   #     embed_unit: int=128,
-   #     att_unit: int=256,
-   #     head: int=2,
-   #     unit: int=1024,
-   #     layer: int=4,
-   #     dropout_rate: float=0.5,
-   #     emb_dropout_rate: float = 0.0,
-   #     att_dropout_rate: float = 0.0,
-   #     tie_weights: bool = False,):
+    #     n_vocab: int,
+    #     pos_enc: str=None,
+    #     embed_unit: int=128,
+    #     att_unit: int=256,
+    #     head: int=2,
+    #     unit: int=1024,
+    #     layer: int=4,
+    #     dropout_rate: float=0.5,
+    #     emb_dropout_rate: float = 0.0,
+    #     att_dropout_rate: float = 0.0,
+    #     tie_weights: bool = False,):
     paddle.set_device("cpu")
     model_dict = paddle.load("transformerLM.pdparams")
     tlm.set_state_dict(model_dict)
@@ -236,11 +235,11 @@ if __name__ == "__main__":
     state = None
     output, state = tlm.score(input2, state, None)
 
-    input3 = np.array([5,10])
+    input3 = np.array([5, 10])
     input3 = paddle.to_tensor(input3)
     output, state = tlm.score(input3, state, None)
 
-    input4 = np.array([5,10,0])
+    input4 = np.array([5, 10, 0])
     input4 = paddle.to_tensor(input4)
     output, state = tlm.score(input4, state, None)
     print("output", output)
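Note: the `batch_score` hunks keep the scorer's externally visible state as one list per hypothesis, while `forward_one_step` consumes one batched cache per layer; the list comprehensions at the top and bottom of `batch_score` convert between those two layouts. A self-contained toy round trip of the same transpose:

```python
# Toy shapes: 3 hypotheses in the beam, 2 encoder layers.
n_batch, n_layers = 3, 2
states = [[f"hyp{b}-layer{i}" for i in range(n_layers)] for b in range(n_batch)]

# [batch][layer] -> [layer][batch]: merge per-hypothesis caches so each
# layer sees one batched cache (the layout forward_one_step consumes).
batch_state = [[states[b][i] for b in range(n_batch)] for i in range(n_layers)]

# [layer][batch] -> [batch][layer]: split the updated per-layer caches
# back out so beam search can prune and reorder hypotheses independently.
state_list = [[batch_state[i][b] for i in range(n_layers)]
              for b in range(n_batch)]
assert state_list == states
```
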
diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py
index 518f2bbb6..6288e2ee5 100644
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@@ -24,9 +24,9 @@ from deepspeech.modules.activation import get_activation
 from deepspeech.modules.attention import MultiHeadedAttention
 from deepspeech.modules.attention import RelPositionMultiHeadedAttention
 from deepspeech.modules.conformer_convolution import ConvolutionModule
+from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.embedding import RelPositionalEncoding
-from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.encoder_layer import ConformerEncoderLayer
 from deepspeech.modules.encoder_layer import TransformerEncoderLayer
 from deepspeech.modules.mask import add_optional_chunk_mask
@@ -378,8 +378,7 @@ class TransformerEncoder(BaseEncoder):
             self,
             xs: paddle.Tensor,
             masks: paddle.Tensor,
-            cache=None,
-    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+            cache=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Encode input frame.
 
         Args:
@@ -397,9 +396,11 @@ class TransformerEncoder(BaseEncoder):
 
         if isinstance(self.embed, Conv2dSubsampling):
             #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-            xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
+            xs, pos_emb, masks = self.embed(
+                xs, masks.astype(xs.dtype), offset=0)
         else:
-            xs , pos_emb, masks= self.embed(xs, masks.astype(xs.dtype), offset=0)
+            xs, pos_emb, masks = self.embed(
+                xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
 
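Note: the final hunk normalizes the fix Patch 1/2 introduced: both branches of `forward_one_step` now call the embedding with the same `(xs, masks, offset)` signature and unpack the same `(xs, pos_emb, masks)` triple, instead of the bare `xs = self.embed(xs)` the non-`Conv2dSubsampling` branch used before. A hypothetical stub of that contract (the real classes live in `deepspeech.modules.embedding`; this stand-in only mirrors the interface the fix relies on):

```python
import paddle
import paddle.nn as nn


class NoPosEncodingStub(nn.Layer):
    """Hypothetical stand-in mirroring the embed contract used above."""

    def forward(self, xs, masks, offset=0):
        # xs: (B, T, D). No positional information here: return a zero
        # pos_emb of matching shape so downstream attention code can treat
        # every embedding type through the same (xs, pos_emb, masks) triple.
        pos_emb = paddle.zeros([1, xs.shape[1], xs.shape[2]], dtype=xs.dtype)
        return xs, pos_emb, masks
```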