Code clean up for CIs

pull/3242/head
jiamingkong 2 years ago
parent 3ef28dee45
commit 0e2068e2cf

@ -49,9 +49,6 @@ def _mha_shape_check(query: paddle.Tensor, key: paddle.Tensor, value: paddle.Ten
raise AssertionError(
f"query should be unbatched 2D or batched 3D tensor but received {query.dim()}-D query tensor")
def masked_fill(x, mask, value):
y = paddle.full(x.shape, value)
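The removed `masked_fill` helper stops after building a full tensor of `value`; the complete operation can be expressed with `paddle.where`. A minimal sketch (helper name and argument order follow the snippet above; the boolean cast is an assumption):

    import paddle

    def masked_fill(x, mask, value):
        # fill positions where mask is True with `value`, keep x elsewhere
        y = paddle.full(x.shape, value, dtype=x.dtype)
        return paddle.where(mask.astype('bool'), y, x)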
def scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal):
"""
@ -61,18 +58,22 @@ def scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal):
d_key = k.shape[-1]
scaled_q = paddle.scale(x=q, scale=d_key ** -0.5)
product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
weights = paddle.nn.functional.softmax(x=product + attn_mask)
weights = F.softmax(x=product + attn_mask)
if dropout_p:
weights = paddle.fluid.layers.nn.dropout(
weights = F.dropout(
weights,
dropout_prob=dropout_p,
dropout_implementation="upscale_in_train",
is_test=False)
p=dropout_p,
training=True,
mode="upscale_in_train"
)
out = paddle.matmul(x=weights, y=v)
return out
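As written, `scaled_dot_product_attention` expects an additive `attn_mask` that is broadcast onto the raw scores. A minimal usage sketch with toy shapes and an explicit causal mask built from `paddle.triu` (shapes and mask construction are illustrative assumptions, not part of the diff):

    import paddle

    B, H, L, D = 2, 4, 6, 16
    q = paddle.randn([B, H, L, D])
    k = paddle.randn([B, H, L, D])
    v = paddle.randn([B, H, L, D])

    # additive causal mask: 0 on/below the diagonal, -inf above it
    neg_inf = paddle.full([L, L], float("-inf"))
    causal_mask = paddle.triu(neg_inf, diagonal=1)

    # mask supplied explicitly; the is_causal flag's handling is not shown in this hunk
    out = scaled_dot_product_attention(q, k, v, causal_mask, dropout_p=0.0, is_causal=False)
    print(out.shape)  # [2, 4, 6, 16]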
def addr(input, vec1, vec2, beta=1, alpha=1, out=None):
"""
A helper function to calculate alpha*(vec1*vec2^T) + beta*input
"""
row = vec1.shape[0]
column = vec2.shape[0]
vec1 = paddle.unsqueeze(vec1, 0)
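The docstring formula alpha*(vec1*vec2^T) + beta*input can be completed with an explicit outer product. A minimal standalone sketch (independent of the truncated body above; the name is illustrative):

    import paddle

    def addr_sketch(input, vec1, vec2, beta=1, alpha=1):
        # outer product vec1 (x) vec2 -> shape (len(vec1), len(vec2))
        outer = paddle.matmul(vec1.unsqueeze(1), vec2.unsqueeze(0))
        return alpha * outer + beta * input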
@ -164,12 +165,11 @@ def _in_projection_packed(
- in output list :math:`[q', k', v']`, each output tensor will have the
same shape as the corresponding input tensor.
"""
# E = q.size(-1)
E = q.shape[-1]
if k is v:
if q is k:
# self-attention
proj = linear(q, w, b)
proj = F.linear(q, w, b)
# reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose([2, 1, 0]).squeeze(-2).contiguous()
return proj[0], proj[1], proj[2]
@ -180,8 +180,8 @@ def _in_projection_packed(
b_q = b_kv = None
else:
b_q, b_kv = b.split([E, E * 2])
q_proj = linear(q, w_q, b_q)
kv_proj = linear(k, w_kv, b_kv)
q_proj = F.linear(q, w_q, b_q)
kv_proj = F.linear(k, w_kv, b_kv)
# reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk()
kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose([2, 1, 0]).squeeze(-2).contiguous()
return (q_proj, kv_proj[0], kv_proj[1])
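The self-attention branch above packs the three projections into one matmul and then splits the last dimension into (3, E). The same idea written with plain reshape/transpose and hypothetical shapes (the input-major weight layout is an assumption):

    import paddle

    L, N, E = 4, 2, 8
    q = paddle.randn([L, N, E])
    w_packed = paddle.randn([E, 3 * E])                        # hypothetical packed weight
    proj = paddle.matmul(q, w_packed)                          # (L, N, 3E)
    proj = proj.reshape([L, N, 3, E]).transpose([2, 0, 1, 3])  # (3, L, N, E)
    q_p, k_p, v_p = proj[0], proj[1], proj[2]                  # each (L, N, E), same shape as the input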
@ -191,7 +191,7 @@ def _in_projection_packed(
b_q = b_k = b_v = None
else:
b_q, b_k, b_v = b.chunk(3)
return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
def _in_projection(
q: paddle.Tensor,
@ -204,10 +204,8 @@ def _in_projection(
b_k: Optional[paddle.Tensor] = None,
b_v: Optional[paddle.Tensor] = None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
A, B, C = linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
A, B, C = F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
return A, B, C
# return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
def multi_head_attention_forward_paddle(
query: paddle.Tensor,
@ -299,22 +297,7 @@ def multi_head_attention_forward_paddle(
"""
is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
# For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
# is batched, run the computation and before returning squeeze the
# batch dimension so that the output doesn't carry this temporary batch dimension.
# if not is_batched:
# # unsqueeze if the input is unbatched
# query = query.unsqueeze(1)
# key = key.unsqueeze(1)
# value = value.unsqueeze(1)
# if key_padding_mask is not None:
# key_padding_mask = key_padding_mask.unsqueeze(0)
# set up shape vars
# import pdb; pdb.set_trace()
tgt_len, bsz, embed_dim = query.shape
# tgt_len, bsz, embed_dim = query.shape
src_len, _, _ = key.shape
if is_causal:
@ -373,9 +356,7 @@ def multi_head_attention_forward_paddle(
if bias_k is not None and bias_v is not None:
assert static_k is None, "bias cannot be added to static key."
assert static_v is None, "bias cannot be added to static value."
# k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
k = paddle.concat([k, bias_k.repeat(1, bsz, 1)], axis=1)
# v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
v = paddle.concat([v, bias_v.repeat(1, bsz, 1)], axis=1)
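`Tensor.repeat` is a torch-style call; in Paddle the equivalent broadcast is usually written with `paddle.tile`. A minimal sketch, assuming `bias_k` has shape (1, 1, embed_dim) and `k` is laid out (src_len, bsz, embed_dim):

    # hypothetical shapes: bias_k (1, 1, E), k (src_len, bsz, E)
    bias_k_tiled = paddle.tile(bias_k, repeat_times=[1, bsz, 1])  # (1, bsz, E)
    k = paddle.concat([k, bias_k_tiled], axis=0)                  # append one step along the sequence axis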
if attn_mask is not None:
# attn_mask = pad(attn_mask, (0, 1))
@ -392,22 +373,18 @@ def multi_head_attention_forward_paddle(
#
# reshape q, k, v for multihead attention and make em batch first
#
# q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
q = q.reshape([tgt_len, bsz * num_heads, head_dim]).transpose([1, 0, 2])
if static_k is None:
# k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
k = k.reshape([k.shape[0], bsz * num_heads, head_dim]).transpose([1, 0, 2])
else:
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
assert static_k.size(0) == bsz * num_heads, \
f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
assert static_k.size(2) == head_dim, \
f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
k = static_k
if static_v is None:
# v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
v = v.reshape([v.shape[0], bsz * num_heads, head_dim]).transpose([1, 0, 2])
else:
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
@ -420,9 +397,7 @@ def multi_head_attention_forward_paddle(
# add zero attention along batch dimension (now first)
if add_zero_attn:
zero_attn_shape = (bsz * num_heads, 1, head_dim)
# k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
k = paddle.concat([k, paddle.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], axis=1)
# v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
v = paddle.concat([v, paddle.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], axis=1)
if attn_mask is not None:
# attn_mask = pad(attn_mask, (0, 1))
@ -438,7 +413,6 @@ def multi_head_attention_forward_paddle(
if key_padding_mask is not None:
assert key_padding_mask.shape == (bsz, src_len), \
f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
# key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
key_padding_mask = key_padding_mask.reshape([bsz, 1, 1, src_len]).expand([-1, num_heads, -1, -1]).reshape([bsz * num_heads, 1, src_len])
if attn_mask is None:
attn_mask = key_padding_mask
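The reshape/expand above turns a (bsz, src_len) padding mask into one mask row per attention head. A minimal sketch of the same broadcast with toy sizes:

    import paddle

    bsz, num_heads, src_len = 2, 4, 5
    key_padding_mask = paddle.zeros([bsz, src_len], dtype='bool')
    per_head = (key_padding_mask
                .reshape([bsz, 1, 1, src_len])
                .expand([-1, num_heads, -1, -1])      # broadcast across heads
                .reshape([bsz * num_heads, 1, src_len]))
    print(per_head.shape)  # [8, 1, 5]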
@ -456,25 +430,20 @@ def multi_head_attention_forward_paddle(
B, Nt, E = q.shape
q_scaled = q / math.sqrt(E)
if attn_mask is not None:
# attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
attn_output_weights = addr(q_scaled, k.transpose(-2, -1))
else:
# attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
attn_output_weights = paddle.bmm(q_scaled, k.transpose(0, 2, 1))
# attn_output_weights = softmax(attn_output_weights, dim=-1)
attn_output_weights = paddle.nn.functional.softmax(attn_output_weights, axis=-1)
attn_output_weights = F.softmax(attn_output_weights, axis=-1)
if dropout_p > 0.0:
# attn_output_weights = dropout(attn_output_weights, p=dropout_p)
attn_output_weights = paddle.nn.functional.dropout(attn_output_weights, p=dropout_p)
attn_output_weights = F.dropout(attn_output_weights, p=dropout_p)
# attn_output = torch.bmm(attn_output_weights, v)
attn_output = paddle.bmm(attn_output_weights, v)
attn_output = attn_output.transpose([1, 0, 2]).reshape([tgt_len * bsz, embed_dim])
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
# attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = attn_output.reshape([tgt_len, bsz, attn_output.shape[1]])
# optionally average attention weights over heads
# attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
attn_output_weights = attn_output_weights.reshape([bsz, num_heads, tgt_len, src_len])
if average_attn_weights:
attn_output_weights = attn_output_weights.mean(dim=1)
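The masked branch above stands in for `torch.baddbmm`, i.e. attn_mask + q_scaled @ k^T. In Paddle that combination can be written as a batched matmul followed by a broadcast add. A minimal sketch with assumed shapes (B, Nt, E) for q and (B, Ns, E) for k:

    import paddle
    import paddle.nn.functional as F

    B, Nt, Ns, E = 8, 4, 5, 16
    q_scaled = paddle.randn([B, Nt, E])
    k = paddle.randn([B, Ns, E])
    attn_mask = paddle.zeros([Nt, Ns])                      # additive mask, broadcast over batch

    scores = paddle.bmm(q_scaled, k.transpose([0, 2, 1]))   # (B, Nt, Ns)
    scores = scores + attn_mask                             # baddbmm with beta = alpha = 1
    weights = F.softmax(scores, axis=-1)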
@ -492,7 +461,6 @@ def multi_head_attention_forward_paddle(
if attn_mask.shape[0] == 1 and attn_mask.dim() == 3:
attn_mask = attn_mask.unsqueeze(0)
else:
# attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
attn_mask = attn_mask.reshape([bsz, num_heads, -1, src_len])
q = q.reshape([bsz, num_heads, tgt_len, head_dim])
@ -500,9 +468,6 @@ def multi_head_attention_forward_paddle(
v = v.reshape([bsz, num_heads, src_len, head_dim])
attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
attn_output = attn_output.transpose(perm=[2, 0, 1, 3]).reshape([bsz * tgt_len, embed_dim])
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
attn_output = attn_output.reshape([tgt_len, bsz, attn_output.shape[1]])
# if not is_batched:
# # squeeze the output if input was unbatched
# attn_output = attn_output.squeeze(1)
return attn_output, None

@ -60,19 +60,6 @@ class Fp32GroupNorm(nn.GroupNorm):
return output.type_as(input)
# class GradMultiply(torch.autograd.Function):
# convert into paddle equivalent
# class GradMultiply(torch.autograd.Function):
# @staticmethod
# def forward(ctx, x, scale):
# ctx.scale = scale
# res = x.new(x)
# return res
# @staticmethod
# def backward(ctx, grad):
# return grad * ctx.scale, None
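The commented-out `GradMultiply` is an identity in the forward pass that scales gradients in the backward pass; in Paddle this pattern is usually expressed with `paddle.autograd.PyLayer`. A minimal sketch following the commented torch version (not part of the commit):

    import paddle
    from paddle.autograd import PyLayer

    class GradMultiply(PyLayer):
        @staticmethod
        def forward(ctx, x, scale):
            ctx.scale = scale
            return x.clone()          # identity in the forward pass

        @staticmethod
        def backward(ctx, grad):
            return grad * ctx.scale   # scale only the gradient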
class SamePad(nn.Layer):
def __init__(self, kernel_size, causal=False):
@ -95,7 +82,6 @@ class Swish(nn.Layer):
def __init__(self):
"""Construct an MultiHeadedAttention object."""
super(Swish, self).__init__()
# self.act = torch.nn.Sigmoid()
self.act = nn.Sigmoid()
def forward(self, x):
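The `forward` shown here is truncated; Swish is conventionally x * sigmoid(x), which matches the `nn.Sigmoid` member above. A minimal standalone sketch (the body is an assumption):

    import paddle
    import paddle.nn as nn

    class SwishSketch(nn.Layer):
        def __init__(self):
            super().__init__()
            self.act = nn.Sigmoid()

        def forward(self, x):
            return x * self.act(x)    # swish(x) = x * sigmoid(x)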
@ -162,7 +148,6 @@ def get_activation_fn(activation: str):
elif activation == "gelu_accurate":
return gelu_accurate
elif activation == "tanh":
# return torch.tanh
return paddle.tanh
elif activation == "linear":
return lambda x: x
@ -172,44 +157,6 @@ def get_activation_fn(activation: str):
raise RuntimeError("--activation-fn {} not supported".format(activation))
def init_bert_params(module):
"""
Initialize the weights specific to the BERT Model.
This overrides the default initializations depending on the specified arguments.
1. If normal_init_linear_weights is set then weights of linear
layer will be initialized using the normal distribution and
bias will be set to the specified value.
2. If normal_init_embed_weights is set then weights of embedding
layer will be initialized using the normal distribution.
3. If normal_init_proj_weights is set then weights of
in_project_weight for MultiHeadAttention initialized using
the normal distribution (to be validated).
"""
def normal_(data):
# with FSDP, module params will be on CUDA, so we cast them back to CPU
# so that the RNG is consistent with and without FSDP
data.copy_(
data.cpu().normal_(mean=0.0, std=0.02).to(data.device)
)
if isinstance(module, nn.Linear):
# normal_(module.weight.data)
if module.bias is not None:
# module.bias.data.zero_()
pass
if isinstance(module, nn.Embedding):
# normal_(module.weight.data)
if module.padding_idx is not None:
# module.weight.data[module.padding_idx].zero_()
pass
if isinstance(module, MultiheadAttention):
pass
# normal_(module.q_proj.weight.data)
# normal_(module.k_proj.weight.data)
# normal_(module.v_proj.weight.data)
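The docstring above describes BERT-style initialization: normal weights with std 0.02 and zero bias. A minimal Paddle sketch of that recipe (function name and the use of `paddle.nn.initializer.Normal` are assumptions, not part of the commit; the padding-row reset for embeddings is omitted):

    import paddle.nn as nn
    from paddle.nn.initializer import Normal, Constant

    normal_ = Normal(mean=0.0, std=0.02)
    zeros_ = Constant(value=0.0)

    def init_bert_params_sketch(module):
        if isinstance(module, nn.Linear):
            normal_(module.weight)
            if module.bias is not None:
                zeros_(module.bias)
        if isinstance(module, nn.Embedding):
            normal_(module.weight)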
def quant_noise(module, p, block_size):
"""
Wraps modules and applies quantization noise to the weights for
@ -302,9 +249,8 @@ def quant_noise(module, p, block_size):
# scale weights and apply mask
mask = mask.to(
# torch.bool
paddle.bool
) # x.bool() is not currently supported in TorchScript
)
s = 1 / (1 - p)
mod.weight.data = s * weight.masked_fill(mask, 0)
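The masking step above implements quantization noise: drop a random subset of weights and rescale the survivors by 1/(1-p) so the expectation is unchanged. A minimal element-wise sketch (block handling omitted; names are illustrative):

    import paddle

    def quant_noise_mask_sketch(weight, p):
        # Bernoulli(p) mask marks the weights to zero out
        mask = paddle.rand(weight.shape) < p
        scale = 1.0 / (1.0 - p)
        return scale * paddle.where(mask, paddle.zeros_like(weight), weight)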
@ -405,7 +351,6 @@ class MultiheadAttention(nn.Layer):
self.gru_rel_pos = gru_rel_pos
if self.gru_rel_pos:
self.grep_linear = nn.Linear(self.q_head_dim, 8)
# self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))
self.grep_a = self.create_parameter(
shape=[1, num_heads, 1, 1], dtype="float32"
)
@ -415,48 +360,7 @@ class MultiheadAttention(nn.Layer):
def reset_parameters(self):
pass
# if self.qkv_same_dim:
# # Empirically observed the convergence to be much better with
# # the scaled initialization
# # nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
# # self.k_proj.weight.set_value(
# # paddle.nn.initializer.XavierUniform(1, 1)(self.k_proj.weight.shape)
# # )
# # self.v_proj.weight.set_value(
# # paddle.nn.initializer.XavierUniform(1, 1)(self.v_proj.weight.shape)
# # )
# # self.q_proj.weight.set_value(
# # paddle.nn.initializer.XavierUniform(1, 1)(self.q_proj.weight.shape)
# # )
# pass
# # nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
# # nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
# else:
# # nn.init.xavier_uniform_(self.k_proj.weight)
# # nn.init.xavier_uniform_(self.v_proj.weight)
# # nn.init.xavier_uniform_(self.q_proj.weight)
# # self.k_proj.weight.set_value(
# # paddle.nn.initializer.XavierUniform()(self.k_proj.weight.shape)
# # )
# # self.v_proj.weight.set_value(
# # paddle.nn.initializer.XavierUniform()(self.v_proj.weight.shape)
# # )
# # self.q_proj.weight.set_value(
# # paddle.nn.initializer.XavierUniform()(self.q_proj.weight.shape)
# # )
# pass
# nn.init.xavier_uniform_(self.out_proj.weight)
# if self.out_proj.bias is not None:
# nn.init.constant_(self.out_proj.bias, 0.0)
# if self.bias_k is not None:
# nn.init.xavier_normal_(self.bias_k)
# if self.bias_v is not None:
# nn.init.xavier_normal_(self.bias_v)
# if self.has_relative_attention_bias:
# nn.init.xavier_normal_(self.relative_attention_bias.weight)
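The commented block above corresponds to Xavier-style initialization of the q/k/v and output projections; in Paddle the same effect is usually obtained with `paddle.nn.initializer.XavierUniform`. A minimal sketch (the gain-scaled variant from the comments is approximated by a plain XavierUniform; an assumption, not part of the commit):

    from paddle.nn.initializer import XavierUniform, Constant

    xavier_ = XavierUniform()
    zeros_ = Constant(value=0.0)

    def reset_qkv_out_proj_sketch(attn):
        # hypothetical helper: attn is assumed to expose q/k/v/out projection layers
        for proj in (attn.q_proj, attn.k_proj, attn.v_proj, attn.out_proj):
            xavier_(proj.weight)
            if proj.bias is not None:
                zeros_(proj.bias)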
def _relative_positions_bucket(self, relative_positions, bidirectional=True):
num_buckets = self.num_buckets
max_distance = self.max_distance
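`_relative_positions_bucket` follows the T5-style scheme: small offsets get their own bucket, larger ones are binned logarithmically up to `max_distance`. A minimal standalone sketch of that scheme (a generic reference, not the body of the method here):

    import math
    import paddle

    def relative_positions_bucket_sketch(relative_positions, num_buckets=32,
                                         max_distance=128, bidirectional=True):
        # relative_positions: int64 tensor of (query_pos - key_pos) offsets
        n = -relative_positions
        ret = paddle.zeros_like(n)
        if bidirectional:
            num_buckets //= 2
            ret = ret + (n < 0).astype('int64') * num_buckets
            n = paddle.abs(n)
        else:
            n = paddle.maximum(n, paddle.zeros_like(n))
        max_exact = num_buckets // 2
        is_small = n < max_exact
        # logarithmic binning for larger distances (clamp avoids log(0); the
        # clamped entries are discarded by the where() below)
        n_clamped = paddle.maximum(n, paddle.ones_like(n)).astype('float32')
        val_if_large = max_exact + (
            paddle.log(n_clamped / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).astype('int64')
        val_if_large = paddle.minimum(val_if_large,
                                      paddle.full_like(val_if_large, num_buckets - 1))
        return ret + paddle.where(is_small, n, val_if_large)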
@ -544,7 +448,6 @@ class MultiheadAttention(nn.Layer):
position_bias = paddle.concat([position_bias_ for _ in range(bsz)], axis=0)
position_bias = position_bias.reshape([bsz * self.num_heads, tgt_len, src_len])
if (
# not is_tpu # don't use PyTorch version on TPUs
incremental_state is None
and not static_kv
and self.q_head_dim == self.head_dim
@ -740,7 +643,6 @@ class MultiheadAttention(nn.Layer):
)
# attn_weights = torch.bmm(q, k.transpose(1, 2))
attn_weights = paddle.bmm(q, k.transpose(1, 2))
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
@ -753,16 +655,10 @@ class MultiheadAttention(nn.Layer):
if key_padding_mask is not None:
# don't attend to padding symbols
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
if not is_tpu:
attn_weights = attn_weights.masked_fill(
# key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
key_padding_mask.unsqueeze(1).unsqueeze(2).to(paddle.bool),
float("-inf"),
)
else:
attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2).to(paddle.bool),
float("-inf"),
)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if before_softmax:
@ -772,8 +668,6 @@ class MultiheadAttention(nn.Layer):
if self.gru_rel_pos == 1:
query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim)
_B, _H, _L, __ = query_layer.shape
# gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view(
# _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1)
gate_a, gate_b = paddle.sigmoid(self.grep_linear(query_layer).view(
_B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, axis=-1)
@ -791,7 +685,6 @@ class MultiheadAttention(nn.Layer):
attn_probs = self.dropout_module(attn_weights)
assert v is not None
# attn = torch.bmm(attn_probs, v)
attn = paddle.bmm(attn_probs, v)
assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
@ -819,9 +712,6 @@ class MultiheadAttention(nn.Layer):
if prev_key_padding_mask is not None and static_kv:
new_key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None:
# new_key_padding_mask = torch.cat(
# [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
# )
new_key_padding_mask = paddle.concat(
[prev_key_padding_mask.float(), key_padding_mask.float()], axis=1
)
@ -830,18 +720,10 @@ class MultiheadAttention(nn.Layer):
# is None
elif prev_key_padding_mask is not None:
if src_len > prev_key_padding_mask.size(1):
# filler = torch.zeros(
# (batch_size, src_len - prev_key_padding_mask.size(1)),
# device=prev_key_padding_mask.device,
# )
filler = paddle.zeros(
(batch_size, src_len - prev_key_padding_mask.size(1)),
device=prev_key_padding_mask.device,
)
# new_key_padding_mask = torch.cat(
# [prev_key_padding_mask.float(), filler.float()], dim=1
# )
new_key_padding_mask = paddle.concat(
[prev_key_padding_mask.float(), filler.float()], axis=1
)
@ -850,17 +732,10 @@ class MultiheadAttention(nn.Layer):
new_key_padding_mask = prev_key_padding_mask.float()
elif key_padding_mask is not None:
if src_len > key_padding_mask.size(1):
# filler = torch.zeros(
# (batch_size, src_len - key_padding_mask.size(1)),
# device=key_padding_mask.device,
# )
filler = paddle.zeros(
(batch_size, src_len - key_padding_mask.size(1)),
device=key_padding_mask.device,
)
# new_key_padding_mask = torch.cat(
# [filler.float(), key_padding_mask.float()], dim=1
# )
new_key_padding_mask = paddle.concat(
[filler.float(), key_padding_mask.float()], axis=1
)

@ -21,7 +21,6 @@ from paddle import Tensor
from .modules.modules import (
MultiheadAttention,
SamePad,
init_bert_params,
get_activation_fn,
TransposeLast,
GLU_Linear,
