diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 2da68435c..99b8bcbe6 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -159,9 +159,7 @@ if not hasattr(paddle.Tensor, 'new_full'): def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor: if convert_dtype_to_string(xs.dtype) == paddle.bool: xs = xs.astype(paddle.int) - return xs.equal( - paddle.to_tensor( - ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place)) + return xs.equal(ys) if not hasattr(paddle.Tensor, 'eq'): @@ -219,13 +217,22 @@ def is_broadcastable(shp1, shp2): return True +def broadcast_shape(shp1, shp2): + result = [] + for a, b in zip(shp1[::-1], shp2[::-1]): + result.append(max(a, b)) + return result[::-1] + + def masked_fill(xs: paddle.Tensor, mask: paddle.Tensor, value: Union[float, int]): - assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape, - mask.shape) - bshape = paddle.broadcast_shape(xs.shape, mask.shape) - mask = mask.broadcast_to(bshape) + bshape = broadcast_shape(xs.shape, mask.shape) + mask.stop_gradient = True + tmp = paddle.ones(shape=[len(bshape)], dtype='int32') + for index in range(len(bshape)): + tmp[index] = bshape[index] + mask = mask.broadcast_to(tmp) trues = paddle.ones_like(xs) * value xs = paddle.where(mask, trues, xs) return xs diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 72300579f..ad73f5e99 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -253,8 +253,8 @@ class BaseEncoder(nn.Layer): # cnn_cache[i] = (B=1, hidden-dim, cache_t2) xs, _, new_att_cache, new_cnn_cache = layer( xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + att_cache=att_cache if elayers == 0 else att_cache[i:i+1], + cnn_cache=cnn_cache if paddle.shape(cnn_cache)[0] == 0 else cnn_cache[i], ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2)