diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 207e470a..c25c2186 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -124,10 +124,7 @@ class U2BaseModel(ASRInterface, nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. Attention-decoder branch loss_att = None @@ -291,8 +288,7 @@ class U2BaseModel(ASRInterface, nn.Layer): # 2. Decoder forward step by step for i in range(1, maxlen + 1): # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: + if end_flag.sum() == running_size: break # 2.1 Forward decoder step @@ -378,9 +374,7 @@ class U2BaseModel(ASRInterface, nn.Layer): speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) maxlen = encoder_out.shape[1] - # (TODO Hui Zhang): bool no support reduce_sum - # encoder_out_lens = encoder_mask.squeeze(1).sum(1) - encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0..31defbba 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -111,10 +111,7 @@ class U2STBaseModel(nn.Layer): encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_time = time.time() - start #logger.debug(f"encoder time: {encoder_time}") - #TODO(Hui Zhang): sum not support bool type - #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] - encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum( - 1) #[B, 1, T] -> [B] + encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B] # 2a. ST-decoder branch start = time.time() diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 03b637b7..5e1b4c92 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -140,9 +140,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): # m: (1, L, L) m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) # tgt_mask: (B, L, L) - # TODO(Hui Zhang): not support & for tensor - # tgt_mask = tgt_mask & m - tgt_mask = tgt_mask.logical_and(m) + tgt_mask = tgt_mask & m x, _ = self.embed(tgt) for layer in self.decoders: @@ -153,9 +151,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): if self.use_output_layer: x = self.output_layer(x) - # TODO(Hui Zhang): reduce_sum not support bool type - # olens = tgt_mask.sum(1) - olens = tgt_mask.astype(paddle.int).sum(1) + olens = tgt_mask.sum(1) return x, paddle.to_tensor(0.0), olens def forward_one_step( diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 2e76ccb0..db584884 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -164,12 +164,8 @@ class BaseEncoder(nn.Layer): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) - #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor - masks = masks.astype(paddle.bool) - #TODO(Hui Zhang): mask_pad = ~masks - mask_pad = masks.logical_not() + xs, pos_emb, masks = self.embed(xs, masks, offset=0) + mask_pad = ~masks chunk_masks = add_optional_chunk_mask( xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, decoding_chunk_size, self.static_chunk_size, @@ -215,11 +211,8 @@ class BaseEncoder(nn.Layer): same shape as the original cnn_cache """ assert xs.shape[0] == 1 # batch size must be one - # tmp_masks is just for interface compatibility - # TODO(Hui Zhang): stride_slice not support bool tensor - # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool) - tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) - tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] + # tmp_masks is just for interface compatibility, [B=1, C=1, T] + tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool) if self.global_cmvn is not None: xs = self.global_cmvn(xs) @@ -228,9 +221,8 @@ class BaseEncoder(nn.Layer): xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) - elayers = paddle.shape(att_cache)[0] - cache_t1 = paddle.shape(att_cache)[2] - chunk_size = paddle.shape(xs)[1] + elayers, _, cache_t1, _ = att_cache.shape + chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size # only used when using `RelPositionMultiHeadedAttention` @@ -402,11 +394,7 @@ class TransformerEncoder(BaseEncoder): if self.global_cmvn is not None: xs = self.global_cmvn(xs) - #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor - xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) - #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor - masks = masks.astype(paddle.bool) - + xs, pos_emb, masks = self.embed(xs, masks, offset=0) if cache is None: cache = [None for _ in range(len(self.encoders))] new_cache = [] diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 1f66c015..787a0652 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -109,13 +109,8 @@ def subsequent_mask(size: int) -> paddle.Tensor: [1, 1, 1]] """ ret = paddle.ones([size, size], dtype=paddle.bool) - #TODO(Hui Zhang): tril not support bool - #return paddle.tril(ret) - ret = ret.astype(paddle.float) - ret = paddle.tril(ret) - ret = ret.astype(paddle.bool) - return ret - + return paddle.tril(ret) + def subsequent_chunk_mask( size: int, diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index 422d4f82..3ac102f3 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -184,13 +184,8 @@ def th_accuracy(pad_outputs: paddle.Tensor, pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], pad_outputs.shape[1]).argmax(2) mask = pad_targets != ignore_label - #TODO(Hui Zhang): sum not support bool type - # numerator = paddle.sum( - # pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = ( + + numerator = paddle.sum( pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) - numerator = paddle.sum(numerator.type_as(pad_targets)) - #TODO(Hui Zhang): sum not support bool type - # denominator = paddle.sum(mask) - denominator = paddle.sum(mask.type_as(pad_targets)) + denominator = paddle.sum(mask) return float(numerator) / float(denominator)