eliminate dtype casts for bool ops

pull/2425/head
Hui Zhang 2 years ago
parent 8e7a315e00
commit 6de81d74d9

@ -124,10 +124,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
encoder_time = time.time() - start encoder_time = time.time() - start
#logger.debug(f"encoder time: {encoder_time}") #logger.debug(f"encoder time: {encoder_time}")
#TODO(Hui Zhang): sum not support bool type encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
#encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
1) #[B, 1, T] -> [B]
# 2a. Attention-decoder branch # 2a. Attention-decoder branch
loss_att = None loss_att = None
@ -291,8 +288,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
# 2. Decoder forward step by step # 2. Decoder forward step by step
for i in range(1, maxlen + 1): for i in range(1, maxlen + 1):
# Stop if all batch and all beam produce eos # Stop if all batch and all beam produce eos
# TODO(Hui Zhang): if end_flag.sum() == running_size: if end_flag.sum() == running_size:
if end_flag.cast(paddle.int64).sum() == running_size:
break break
# 2.1 Forward decoder step # 2.1 Forward decoder step
@ -378,9 +374,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, simulate_streaming) num_decoding_left_chunks, simulate_streaming)
maxlen = encoder_out.shape[1] maxlen = encoder_out.shape[1]
# (TODO Hui Zhang): bool no support reduce_sum encoder_out_lens = encoder_mask.squeeze(1).sum(1)
# encoder_out_lens = encoder_mask.squeeze(1).sum(1)
encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)

@ -111,10 +111,7 @@ class U2STBaseModel(nn.Layer):
encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
encoder_time = time.time() - start encoder_time = time.time() - start
#logger.debug(f"encoder time: {encoder_time}") #logger.debug(f"encoder time: {encoder_time}")
#TODO(Hui Zhang): sum not support bool type encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
#encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
1) #[B, 1, T] -> [B]
# 2a. ST-decoder branch # 2a. ST-decoder branch
start = time.time() start = time.time()

@ -140,9 +140,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
# m: (1, L, L) # m: (1, L, L)
m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0) m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
# tgt_mask: (B, L, L) # tgt_mask: (B, L, L)
# TODO(Hui Zhang): not support & for tensor tgt_mask = tgt_mask & m
# tgt_mask = tgt_mask & m
tgt_mask = tgt_mask.logical_and(m)
x, _ = self.embed(tgt) x, _ = self.embed(tgt)
for layer in self.decoders: for layer in self.decoders:
@ -153,9 +151,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
if self.use_output_layer: if self.use_output_layer:
x = self.output_layer(x) x = self.output_layer(x)
# TODO(Hui Zhang): reduce_sum not support bool type olens = tgt_mask.sum(1)
# olens = tgt_mask.sum(1)
olens = tgt_mask.astype(paddle.int).sum(1)
return x, paddle.to_tensor(0.0), olens return x, paddle.to_tensor(0.0), olens
def forward_one_step( def forward_one_step(

@ -164,12 +164,8 @@ class BaseEncoder(nn.Layer):
if self.global_cmvn is not None: if self.global_cmvn is not None:
xs = self.global_cmvn(xs) xs = self.global_cmvn(xs)
#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor xs, pos_emb, masks = self.embed(xs, masks, offset=0)
xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0) mask_pad = ~masks
#TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
masks = masks.astype(paddle.bool)
#TODO(Hui Zhang): mask_pad = ~masks
mask_pad = masks.logical_not()
chunk_masks = add_optional_chunk_mask( chunk_masks = add_optional_chunk_mask(
xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
decoding_chunk_size, self.static_chunk_size, decoding_chunk_size, self.static_chunk_size,
@ -215,11 +211,8 @@ class BaseEncoder(nn.Layer):
same shape as the original cnn_cache same shape as the original cnn_cache
""" """
assert xs.shape[0] == 1 # batch size must be one assert xs.shape[0] == 1 # batch size must be one
# tmp_masks is just for interface compatibility # tmp_masks is just for interface compatibility, [B=1, C=1, T]
# TODO(Hui Zhang): stride_slice not support bool tensor tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool)
# tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool)
tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
if self.global_cmvn is not None: if self.global_cmvn is not None:
xs = self.global_cmvn(xs) xs = self.global_cmvn(xs)
@ -228,9 +221,8 @@ class BaseEncoder(nn.Layer):
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim) # after embed, xs=(B=1, chunk_size, hidden-dim)
elayers = paddle.shape(att_cache)[0] elayers, _, cache_t1, _ = att_cache.shape
cache_t1 = paddle.shape(att_cache)[2] chunk_size = xs.shape[1]
chunk_size = paddle.shape(xs)[1]
attention_key_size = cache_t1 + chunk_size attention_key_size = cache_t1 + chunk_size
# only used when using `RelPositionMultiHeadedAttention` # only used when using `RelPositionMultiHeadedAttention`
@ -402,11 +394,7 @@ class TransformerEncoder(BaseEncoder):
if self.global_cmvn is not None: if self.global_cmvn is not None:
xs = self.global_cmvn(xs) xs = self.global_cmvn(xs)
#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor xs, pos_emb, masks = self.embed(xs, masks, offset=0)
xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
#TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
masks = masks.astype(paddle.bool)
if cache is None: if cache is None:
cache = [None for _ in range(len(self.encoders))] cache = [None for _ in range(len(self.encoders))]
new_cache = [] new_cache = []

@ -109,13 +109,8 @@ def subsequent_mask(size: int) -> paddle.Tensor:
[1, 1, 1]] [1, 1, 1]]
""" """
ret = paddle.ones([size, size], dtype=paddle.bool) ret = paddle.ones([size, size], dtype=paddle.bool)
#TODO(Hui Zhang): tril not support bool return paddle.tril(ret)
#return paddle.tril(ret)
ret = ret.astype(paddle.float)
ret = paddle.tril(ret)
ret = ret.astype(paddle.bool)
return ret
def subsequent_chunk_mask( def subsequent_chunk_mask(
size: int, size: int,

@ -184,13 +184,8 @@ def th_accuracy(pad_outputs: paddle.Tensor,
pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1], pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_outputs.shape[1]).argmax(2) pad_outputs.shape[1]).argmax(2)
mask = pad_targets != ignore_label mask = pad_targets != ignore_label
#TODO(Hui Zhang): sum not support bool type
# numerator = paddle.sum( numerator = paddle.sum(
# pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
numerator = (
pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
numerator = paddle.sum(numerator.type_as(pad_targets)) denominator = paddle.sum(mask)
#TODO(Hui Zhang): sum not support bool type
# denominator = paddle.sum(mask)
denominator = paddle.sum(mask.type_as(pad_targets))
return float(numerator) / float(denominator) return float(numerator) / float(denominator)

Loading…
Cancel
Save