diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index c1a35560..1ca6a4fe 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -162,10 +162,7 @@ class U2BaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. Attention-decoder branch loss_att = None @@ -320,8 +317,7 @@ class U2BaseModel(nn.Layer): # 2. Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: + if end_flag.sum() == running_size: break # 2.1 Forward decoder step @@ -407,9 +403,7 @@ class U2BaseModel(nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) maxlen = encoder_out.size(1) - # (TODO Hui Zhang): bool no support reduce_sum - # encoder_out_lens = encoder_mask.squeeze(1).sum(1) - encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py index b725cc35..531fafd0 100644 --- a/deepspeech/models/u2_st.py +++ b/deepspeech/models/u2_st.py @@ -163,10 +163,7 @@ class U2STBaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. ST-decoder branch start = time.time() @@ -363,8 +360,7 @@ class U2STBaseModel(nn.Layer): # 2. Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: + if end_flag.sum() == running_size: break # 2.1 Forward decoder step diff --git a/deepspeech/modules/attention.py b/deepspeech/modules/attention.py index 4401a4a5..1a984dd4 100644 --- a/deepspeech/modules/attention.py +++ b/deepspeech/modules/attention.py @@ -109,8 +109,8 @@ class MultiHeadedAttention(nn.Layer): p_attn = self.dropout(attn) x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k) - x = x.transpose([0, 2, 1, 3]).contiguous().view( - n_batch, -1, self.h * self.d_k) # (batch, time1, d_model) + x = x.transpose([0, 2, 1, 3]).view(n_batch, -1, self.h * + self.d_k) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model) diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py index 87c9fa49..143f6cc5 100644 --- a/deepspeech/modules/decoder.py +++ b/deepspeech/modules/decoder.py @@ -124,9 +124,7 @@ class TransformerDecoder(nn.Layer): # m: (1, L, L) m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0) # tgt_mask: (B, L, L) - # TODO(Hui Zhang): not support & for tensor - # tgt_mask = tgt_mask & m - tgt_mask = tgt_mask.logical_and(m) + tgt_mask = tgt_mask & m x, _ = self.embed(tgt) for layer in self.decoders: @@ -137,9 +135,7 @@ class TransformerDecoder(nn.Layer): if self.use_output_layer: x = self.output_layer(x) - # TODO(Hui Zhang): reduce_sum not support bool type - # olens = tgt_mask.sum(1) - olens = tgt_mask.astype(paddle.int).sum(1) + olens = tgt_mask.sum(1) return x, olens def forward_one_step( diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py index 71ec61a0..fb44fe29 100644 --- a/deepspeech/modules/encoder.py +++ b/deepspeech/modules/encoder.py @@ -162,8 +162,7 @@ class BaseEncoder(nn.Layer): xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0) #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor masks = masks.astype(paddle.bool) - #TODO(Hui Zhang): mask_pad = ~masks - mask_pad = masks.logical_not() + mask_pad = ~masks chunk_masks = add_optional_chunk_mask( xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, decoding_chunk_size, self.static_chunk_size, diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 8918ca66..f692a818 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -124,9 +124,9 @@ class LabelSmoothingLoss(nn.Layer): # use zeros_like instead of torch.no_grad() for true_dist, # since no_grad() can not be exported by JIT true_dist = paddle.full_like(x, self.smoothing / (self.size - 1)) - ignore = target == self.padding_idx # (B,) + ignore = (target == self.padding_idx) # (B,) - # target = target * (1 - ignore) # avoid -1 index + #TODO(Hui Zhang): target = target * (1 - ignore) # avoid -1 index target = target.masked_fill(ignore, 0) # avoid -1 index # true_dist.scatter_(1, target.unsqueeze(1), self.confidence) target_mask = F.one_hot(target, self.size) @@ -135,10 +135,8 @@ class LabelSmoothingLoss(nn.Layer): kl = self.criterion(F.log_softmax(x, axis=1), true_dist) - #TODO(Hui Zhang): sum not support bool type - #total = len(target) - int(ignore.sum()) - total = len(target) - int(ignore.type_as(target).sum()) + total = len(target) - int(ignore.sum()) denom = total if self.normalize_length else B - #numer = (kl * (1 - ignore)).sum() + #TODO(Hui Zhang): numer = (kl * (1 - ignore)).sum() numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum() return numer / denom diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py index 05e86eb3..6d46f5ba 100644 --- a/deepspeech/modules/mask.py +++ b/deepspeech/modules/mask.py @@ -69,8 +69,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor: [1, 1, 1, 0, 0], [1, 1, 0, 0, 0]] """ - #TODO(Hui Zhang): return ~make_pad_mask(lengths), not support ~ - return make_pad_mask(lengths).logical_not() + return ~make_pad_mask(lengths) def subsequent_mask(size: int) -> paddle.Tensor: @@ -92,12 +91,7 @@ def subsequent_mask(size: int) -> paddle.Tensor: [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=paddle.bool) - #TODO(Hui Zhang): tril not support bool - #return paddle.tril(ret) - ret = ret.astype(paddle.float) - ret = paddle.tril(ret) - ret = ret.astype(paddle.bool) - return ret + return paddle.tril(ret) def subsequent_chunk_mask( @@ -186,15 +180,13 @@ def add_optional_chunk_mask(xs: paddle.Tensor, chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size, num_left_chunks) # (L, L) chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - # chunk_masks = masks & chunk_masks # (B, L, L) - chunk_masks = masks.logical_and(chunk_masks) # (B, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) elif static_chunk_size > 0: num_left_chunks = num_decoding_left_chunks chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size, num_left_chunks) # (L, L) chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) - # chunk_masks = masks & chunk_masks # (B, L, L) - chunk_masks = masks.logical_and(chunk_masks) # (B, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) else: chunk_masks = masks return chunk_masks diff --git a/deepspeech/utils/tensor_utils.py b/deepspeech/utils/tensor_utils.py index 9bff6b0f..3519f4fa 100644 --- a/deepspeech/utils/tensor_utils.py +++ b/deepspeech/utils/tensor_utils.py @@ -168,13 +168,7 @@ def th_accuracy(pad_outputs: paddle.Tensor, pad_pred = pad_outputs.view( pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2) mask = pad_targets != ignore_label - #TODO(Hui Zhang): sum not support bool type - # numerator = paddle.sum( - # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = ( + numerator = paddle.sum( pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = paddle.sum(numerator.type_as(pad_targets)) - #TODO(Hui Zhang): sum not support bool type - # denominator = paddle.sum(mask) - denominator = paddle.sum(mask.type_as(pad_targets)) + denominator = paddle.sum(mask) return float(numerator) / float(denominator) diff --git a/doc/src/reference.md b/doc/src/reference.md index 69ff6ab8..341e1361 100644 --- a/doc/src/reference.md +++ b/doc/src/reference.md @@ -1,3 +1,6 @@ # Reference +* [delta](https://github.com/Delta-ML/delta.git) +* [espnet](https://github.com/espnet/espnet.git) +* [kaldi](https://github.com/kaldi-asr/kaldi.git) * [wenet](https://github.com/mobvoi/wenet) diff --git a/tests/mask_test.py b/tests/mask_test.py index f44aca8f..dbe8c4b0 100644 --- a/tests/mask_test.py +++ b/tests/mask_test.py @@ -37,13 +37,13 @@ class TestU2Model(unittest.TestCase): def test_make_non_pad_mask(self): res = make_non_pad_mask(self.lengths) - res2 = make_pad_mask(self.lengths).logical_not() + res2 = ~make_pad_mask(self.lengths) self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist()) self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist()) def test_make_pad_mask(self): res = make_pad_mask(self.lengths) - res1 = make_non_pad_mask(self.lengths).logical_not() + res1 = ~make_non_pad_mask(self.lengths) self.assertSequenceEqual(res.numpy().tolist(), self.pad_masks.tolist()) self.assertSequenceEqual(res.numpy().tolist(), res1.tolist())