NumPy compatibility enhancements (#3907)

* NumPy compatibility enhancements

* update code
pull/3918/head
Wang Xin 4 weeks ago committed by GitHub
parent e22173f739
commit 61728f8db6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -129,7 +129,7 @@ def _compute_mask_indices(
[sequence_length for _ in range(batch_size)]) [sequence_length for _ in range(batch_size)])
# SpecAugment mask to fill # SpecAugment mask to fill
spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool) spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=np.bool_)
spec_aug_mask_idxs = [] spec_aug_mask_idxs = []
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
@ -207,9 +207,9 @@ def _sample_negative_indices(features_shape: Tuple,
sampled_negative_indices = np.zeros( sampled_negative_indices = np.zeros(
shape=(batch_size, sequence_length, num_negatives), dtype=np.int32) shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
mask_time_indices = (mask_time_indices.astype(np.bool) mask_time_indices = (mask_time_indices.astype(np.bool_)
if mask_time_indices is not None else if mask_time_indices is not None else
np.ones(features_shape, dtype=np.bool)) np.ones(features_shape, dtype=np.bool_))
for batch_idx in range(batch_size): for batch_idx in range(batch_size):
high = mask_time_indices[batch_idx].sum() - 1 high = mask_time_indices[batch_idx].sum() - 1

@ -1476,7 +1476,7 @@ def compute_mask_indices(
lens = np.fromiter( lens = np.fromiter(
(e - s if e - s >= length + min_space else 0 (e - s if e - s >= length + min_space else 0
for s, e in parts), for s, e in parts),
np.int, ) np.int_, )
l_sum = np.sum(lens) l_sum = np.sum(lens)
if l_sum == 0: if l_sum == 0:
break break

@ -6,25 +6,24 @@
# Based on fairseq code bases # Based on fairseq code bases
# https://github.com/pytorch/fairseq # https://github.com/pytorch/fairseq
# -------------------------------------------------------- # --------------------------------------------------------
import math
import logging import logging
from typing import List, Optional, Tuple import math
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle.nn import LayerNorm
from paddle import Tensor from paddle import Tensor
from .modules.modules import ( from paddle.nn import LayerNorm
MultiheadAttention,
SamePad, from .modules.modules import get_activation_fn
get_activation_fn, from .modules.modules import GLU_Linear
TransposeLast, from .modules.modules import MultiheadAttention
GLU_Linear, from .modules.modules import SamePad
) from .modules.modules import TransposeLast
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -34,12 +33,11 @@ def compute_mask_indices(
padding_mask: Optional[Tensor], padding_mask: Optional[Tensor],
mask_prob: float, mask_prob: float,
mask_length: int, mask_length: int,
mask_type: str = "static", mask_type: str="static",
mask_other: float = 0.0, mask_other: float=0.0,
min_masks: int = 0, min_masks: int=0,
no_overlap: bool = False, no_overlap: bool=False,
min_space: int = 0, min_space: int=0, ) -> np.ndarray:
) -> np.ndarray:
""" """
Computes random mask spans for a given shape Computes random mask spans for a given shape
@ -65,9 +63,7 @@ def compute_mask_indices(
all_num_mask = int( all_num_mask = int(
# add a random number for probabilistic rounding # add a random number for probabilistic rounding
mask_prob * all_sz / float(mask_length) mask_prob * all_sz / float(mask_length) + np.random.rand())
+ np.random.rand()
)
all_num_mask = max(min_masks, all_num_mask) all_num_mask = max(min_masks, all_num_mask)
@ -77,9 +73,7 @@ def compute_mask_indices(
sz = all_sz - padding_mask[i].long().sum().item() sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int( num_mask = int(
# add a random number for probabilistic rounding # add a random number for probabilistic rounding
mask_prob * sz / float(mask_length) mask_prob * sz / float(mask_length) + np.random.rand())
+ np.random.rand()
)
num_mask = max(min_masks, num_mask) num_mask = max(min_masks, num_mask)
else: else:
sz = all_sz sz = all_sz
@ -88,7 +82,8 @@ def compute_mask_indices(
if mask_type == "static": if mask_type == "static":
lengths = np.full(num_mask, mask_length) lengths = np.full(num_mask, mask_length)
elif mask_type == "uniform": elif mask_type == "uniform":
lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) lengths = np.random.randint(
mask_other, mask_length * 2 + 1, size=num_mask)
elif mask_type == "normal": elif mask_type == "normal":
lengths = np.random.normal(mask_length, mask_other, size=num_mask) lengths = np.random.normal(mask_length, mask_other, size=num_mask)
lengths = [max(1, int(round(x))) for x in lengths] lengths = [max(1, int(round(x))) for x in lengths]
@ -119,9 +114,9 @@ def compute_mask_indices(
min_length = min(lengths) min_length = min(lengths)
for length in sorted(lengths, reverse=True): for length in sorted(lengths, reverse=True):
lens = np.fromiter( lens = np.fromiter(
(e - s if e - s >= length + min_space else 0 for s, e in parts), (e - s if e - s >= length + min_space else 0
np.int, for s, e in parts),
) np.int_, )
l_sum = np.sum(lens) l_sum = np.sum(lens)
if l_sum == 0: if l_sum == 0:
break break
@ -137,13 +132,10 @@ def compute_mask_indices(
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray( mask_idc = np.asarray([
[
mask_idc[j] + offset mask_idc[j] + offset
for j in range(len(mask_idc)) for j in range(len(mask_idc)) for offset in range(lengths[j])
for offset in range(lengths[j]) ])
]
)
mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
@ -217,8 +209,7 @@ class WavLMConfig:
class WavLM(nn.Layer): class WavLM(nn.Layer):
def __init__( def __init__(
self, self,
cfg: WavLMConfig, cfg: WavLMConfig, ) -> None:
) -> None:
super().__init__() super().__init__()
logger.info(f"WavLM Config: {cfg.__dict__}") logger.info(f"WavLM Config: {cfg.__dict__}")
@ -230,14 +221,11 @@ class WavLM(nn.Layer):
conv_layers=feature_enc_layers, conv_layers=feature_enc_layers,
dropout=0.0, dropout=0.0,
mode=cfg.extractor_mode, mode=cfg.extractor_mode,
conv_bias=cfg.conv_bias, conv_bias=cfg.conv_bias, )
)
self.post_extract_proj = ( self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim)
nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else
if self.embed != cfg.encoder_embed_dim None)
else None
)
self.mask_prob = cfg.mask_prob self.mask_prob = cfg.mask_prob
self.mask_selection = cfg.mask_selection self.mask_selection = cfg.mask_selection
@ -260,8 +248,7 @@ class WavLM(nn.Layer):
self.mask_emb = self.create_parameter( self.mask_emb = self.create_parameter(
shape=[cfg.encoder_embed_dim], shape=[cfg.encoder_embed_dim],
default_initializer=nn.initializer.Uniform(), default_initializer=nn.initializer.Uniform(), )
)
self.encoder = TransformerEncoder(cfg) self.encoder = TransformerEncoder(cfg)
self.layer_norm = LayerNorm(self.embed) self.layer_norm = LayerNorm(self.embed)
@ -278,8 +265,7 @@ class WavLM(nn.Layer):
self.mask_other, self.mask_other,
min_masks=2, min_masks=2,
no_overlap=self.no_mask_overlap, no_overlap=self.no_mask_overlap,
min_space=self.mask_min_space, min_space=self.mask_min_space, )
)
# mask_indices = torch.from_numpy(mask_indices).to(x.device) # mask_indices = torch.from_numpy(mask_indices).to(x.device)
mask_indices = paddle.to_tensor(mask_indices, dtype='int64') mask_indices = paddle.to_tensor(mask_indices, dtype='int64')
x[mask_indices] = self.mask_emb x[mask_indices] = self.mask_emb
@ -295,40 +281,35 @@ class WavLM(nn.Layer):
self.mask_channel_selection, self.mask_channel_selection,
self.mask_channel_other, self.mask_channel_other,
no_overlap=self.no_mask_channel_overlap, no_overlap=self.no_mask_channel_overlap,
min_space=self.mask_channel_min_space, min_space=self.mask_channel_min_space, )
)
mask_channel_indices = ( mask_channel_indices = (
# torch.from_numpy(mask_channel_indices) # torch.from_numpy(mask_channel_indices)
paddle.to_tensor(mask_channel_indices, dtype='int64') paddle.to_tensor(mask_channel_indices, dtype='int64')
.to(x.device) .to(x.device).unsqueeze(1).expand(-1, T, -1))
.unsqueeze(1)
.expand(-1, T, -1)
)
x[mask_channel_indices] = 0 x[mask_channel_indices] = 0
return x, mask_indices return x, mask_indices
def forward_padding_mask( def forward_padding_mask(
self, features: Tensor, padding_mask: Tensor, self,
) -> Tensor: features: Tensor,
padding_mask: Tensor, ) -> Tensor:
extra = padding_mask.size(1) % features.size(1) extra = padding_mask.size(1) % features.size(1)
if extra > 0: if extra > 0:
padding_mask = padding_mask[:, :-extra] padding_mask = padding_mask[:, :-extra]
padding_mask = padding_mask.view( padding_mask = padding_mask.view(
padding_mask.size(0), features.size(1), -1 padding_mask.size(0), features.size(1), -1)
)
padding_mask = padding_mask.all(-1) padding_mask = padding_mask.all(-1)
return padding_mask return padding_mask
def extract_features( def extract_features(
self, self,
source: Tensor, source: Tensor,
padding_mask: Optional[Tensor] = None, padding_mask: Optional[Tensor]=None,
mask: bool = False, mask: bool=False,
ret_conv: bool = False, ret_conv: bool=False,
output_layer: Optional[int] = None, output_layer: Optional[int]=None,
ret_layer_results: bool = False, ret_layer_results: bool=False, ):
):
if self.feature_grad_mult > 0: if self.feature_grad_mult > 0:
features = self.feature_extractor(source) features = self.feature_extractor(source)
@ -351,9 +332,7 @@ class WavLM(nn.Layer):
features = self.dropout_input(features) features = self.dropout_input(features)
if mask: if mask:
x, mask_indices = self.apply_mask( x, mask_indices = self.apply_mask(features, padding_mask)
features, padding_mask
)
else: else:
x = features x = features
@ -366,10 +345,14 @@ class WavLM(nn.Layer):
x, layer_results = self.encoder( x, layer_results = self.encoder(
x, x,
padding_mask=padding_mask, padding_mask=padding_mask,
layer=None if output_layer is None else output_layer - 1 layer=None if output_layer is None else output_layer - 1)
)
# print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}") # print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}")
res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results} res = {
"x": x,
"padding_mask": padding_mask,
"features": features,
"layer_results": layer_results
}
feature = res["features"] if ret_conv else res["x"] feature = res["features"] if ret_conv else res["x"]
if ret_layer_results: if ret_layer_results:
@ -381,14 +364,12 @@ class WavLM(nn.Layer):
class ConvFeatureExtractionModel(nn.Layer): class ConvFeatureExtractionModel(nn.Layer):
def __init__( def __init__(self,
self,
conv_layers: List[Tuple[int, int, int]], conv_layers: List[Tuple[int, int, int]],
dropout: float = 0.0, dropout: float=0.0,
mode: str = "default", mode: str="default",
conv_bias: bool = False, conv_bias: bool=False,
conv_type: str = "default" conv_type: str="default"):
):
super().__init__() super().__init__()
assert mode in {"default", "layer_norm"} assert mode in {"default", "layer_norm"}
@ -400,16 +381,19 @@ class ConvFeatureExtractionModel(nn.Layer):
stride, stride,
is_layer_norm=False, is_layer_norm=False,
is_group_norm=False, is_group_norm=False,
conv_bias=False, conv_bias=False, ):
):
def make_conv(): def make_conv():
conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias, conv = nn.Conv1D(
n_in,
n_out,
k,
stride=stride,
bias_attr=conv_bias,
weight_attr=nn.initializer.KaimingNormal()) weight_attr=nn.initializer.KaimingNormal())
# nn.init.kaiming_normal_(conv.weight) # nn.init.kaiming_normal_(conv.weight)
return conv return conv
assert ( assert (is_layer_norm and is_group_norm
is_layer_norm and is_group_norm
) == False, "layer norm and group norm are exclusive" ) == False, "layer norm and group norm are exclusive"
if is_layer_norm: if is_layer_norm:
@ -419,19 +403,18 @@ class ConvFeatureExtractionModel(nn.Layer):
nn.Sequential( nn.Sequential(
TransposeLast(), TransposeLast(),
nn.LayerNorm(normalized_shape=dim, epsilon=1e-5), nn.LayerNorm(normalized_shape=dim, epsilon=1e-5),
TransposeLast(), TransposeLast(), ),
), nn.GELU(), )
nn.GELU(),
)
elif is_group_norm: elif is_group_norm:
return nn.Sequential( return nn.Sequential(
make_conv(), make_conv(),
nn.Dropout(p=dropout), nn.Dropout(p=dropout),
nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5), nn.GroupNorm(
nn.GELU(), num_groups=dim, num_channels=dim, epsilon=1e-5),
) nn.GELU(), )
else: else:
return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) return nn.Sequential(
make_conv(), nn.Dropout(p=dropout), nn.GELU())
self.conv_type = conv_type self.conv_type = conv_type
if self.conv_type == "default": if self.conv_type == "default":
@ -449,9 +432,7 @@ class ConvFeatureExtractionModel(nn.Layer):
stride, stride,
is_layer_norm=mode == "layer_norm", is_layer_norm=mode == "layer_norm",
is_group_norm=mode == "default" and i == 0, is_group_norm=mode == "default" and i == 0,
conv_bias=conv_bias, conv_bias=conv_bias, ))
)
)
in_d = dim in_d = dim
elif self.conv_type == "conv2d": elif self.conv_type == "conv2d":
in_d = 1 in_d = 1
@ -460,9 +441,7 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3 assert len(cl) == 3
(dim, k, stride) = cl (dim, k, stride) = cl
self.conv_layers.append( self.conv_layers.append(paddle.nn.Conv2D(in_d, dim, k, stride))
paddle.nn.Conv2D(in_d, dim, k, stride)
)
self.conv_layers.append(paddle.nn.ReLU()) self.conv_layers.append(paddle.nn.ReLU())
in_d = dim in_d = dim
elif self.conv_type == "custom": elif self.conv_type == "custom":
@ -473,17 +452,13 @@ class ConvFeatureExtractionModel(nn.Layer):
assert len(cl) == 3 assert len(cl) == 3
(dim, k, stride) = cl (dim, k, stride) = cl
self.conv_layers.append( self.conv_layers.append(
paddle.nn.Conv2D(in_d, dim, k, stride, padding=1) paddle.nn.Conv2D(in_d, dim, k, stride, padding=1))
) self.conv_layers.append(paddle.nn.LayerNorm([dim, idim]))
self.conv_layers.append(
paddle.nn.LayerNorm([dim, idim])
)
self.conv_layers.append(paddle.nn.ReLU()) self.conv_layers.append(paddle.nn.ReLU())
in_d = dim in_d = dim
if (i + 1) % 2 == 0: if (i + 1) % 2 == 0:
self.conv_layers.append( self.conv_layers.append(
paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True) paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True))
)
idim = int(math.ceil(idim / 2)) idim = int(math.ceil(idim / 2))
else: else:
pass pass
@ -518,8 +493,8 @@ class TransformerEncoder(nn.Layer):
self.dropout = args.dropout self.dropout = args.dropout
self.embedding_dim = args.encoder_embed_dim self.embedding_dim = args.encoder_embed_dim
dropout = 0 dropout = 0
std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) std = math.sqrt(
(4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
self.pos_conv = nn.Conv1D( self.pos_conv = nn.Conv1D(
self.embedding_dim, self.embedding_dim,
@ -528,15 +503,16 @@ class TransformerEncoder(nn.Layer):
padding=args.conv_pos // 2, padding=args.conv_pos // 2,
groups=args.conv_pos_groups, groups=args.conv_pos_groups,
weight_attr=nn.initializer.Normal(mean=0, std=std), weight_attr=nn.initializer.Normal(mean=0, std=std),
bias_attr=True bias_attr=True)
)
# nn.init.normal_(self.pos_conv.weight, mean=0, std=std) # nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
# nn.init.constant_(self.pos_conv.bias, 0) # nn.init.constant_(self.pos_conv.bias, 0)
# self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) # self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
# self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0) # self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0)
self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) self.pos_conv = nn.utils.weight_norm(
self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) self.pos_conv, name="weight", dim=2)
self.pos_conv = nn.Sequential(self.pos_conv,
SamePad(args.conv_pos), nn.GELU())
if hasattr(args, "relative_position_embedding"): if hasattr(args, "relative_position_embedding"):
self.relative_position_embedding = args.relative_position_embedding self.relative_position_embedding = args.relative_position_embedding
@ -547,8 +523,7 @@ class TransformerEncoder(nn.Layer):
self.num_buckets = 0 self.num_buckets = 0
self.max_distance = 0 self.max_distance = 0
self.layers = nn.LayerList( self.layers = nn.LayerList([
[
TransformerSentenceEncoderLayer( TransformerSentenceEncoderLayer(
embedding_dim=self.embedding_dim, embedding_dim=self.embedding_dim,
ffn_embedding_dim=args.encoder_ffn_embed_dim, ffn_embedding_dim=args.encoder_ffn_embed_dim,
@ -558,14 +533,13 @@ class TransformerEncoder(nn.Layer):
activation_dropout=args.activation_dropout, activation_dropout=args.activation_dropout,
activation_fn=args.activation_fn, activation_fn=args.activation_fn,
layer_norm_first=args.layer_norm_first, layer_norm_first=args.layer_norm_first,
has_relative_attention_bias=(self.relative_position_embedding and i == 0), has_relative_attention_bias=(
self.relative_position_embedding and i == 0),
num_buckets=self.num_buckets, num_buckets=self.num_buckets,
max_distance=self.max_distance, max_distance=self.max_distance,
gru_rel_pos=args.gru_rel_pos, gru_rel_pos=args.gru_rel_pos, )
)
for i in range(args.encoder_layers) for i in range(args.encoder_layers)
] ])
)
self.layer_norm_first = args.layer_norm_first self.layer_norm_first = args.layer_norm_first
self.layer_norm = LayerNorm(self.embedding_dim) self.layer_norm = LayerNorm(self.embedding_dim)
@ -574,14 +548,19 @@ class TransformerEncoder(nn.Layer):
# self.apply(init_bert_params) # self.apply(init_bert_params)
def forward(self, x, padding_mask=None, streaming_mask=None, layer=None): def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer) x, layer_results = self.extract_features(x, padding_mask,
streaming_mask, layer)
# print("x.shape", x.shape) # print("x.shape", x.shape)
if self.layer_norm_first and layer is None: if self.layer_norm_first and layer is None:
x = self.layer_norm(x) x = self.layer_norm(x)
return x, layer_results return x, layer_results
def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None): def extract_features(self,
x,
padding_mask=None,
streaming_mask=None,
tgt_layer=None):
if padding_mask is not None: if padding_mask is not None:
x[padding_mask] = 0 x[padding_mask] = 0
@ -598,7 +577,6 @@ class TransformerEncoder(nn.Layer):
# x = x.transpose(0, 1) # x = x.transpose(0, 1)
x = x.transpose([1, 0, 2]) x = x.transpose([1, 0, 2])
layer_results = [] layer_results = []
z = None z = None
if tgt_layer is not None: if tgt_layer is not None:
@ -608,7 +586,12 @@ class TransformerEncoder(nn.Layer):
for i, layer in enumerate(self.layers): for i, layer in enumerate(self.layers):
dropout_probability = np.random.random() dropout_probability = np.random.random()
if not self.training or (dropout_probability > self.layerdrop): if not self.training or (dropout_probability > self.layerdrop):
x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias) x, z, pos_bias = layer(
x,
self_attn_padding_mask=padding_mask,
need_weights=False,
self_attn_mask=streaming_mask,
pos_bias=pos_bias)
if tgt_layer is not None: if tgt_layer is not None:
layer_results.append((x, z)) layer_results.append((x, z))
if i == tgt_layer: if i == tgt_layer:
@ -633,20 +616,19 @@ class TransformerSentenceEncoderLayer(nn.Layer):
def __init__( def __init__(
self, self,
embedding_dim: float = 768, embedding_dim: float=768,
ffn_embedding_dim: float = 3072, ffn_embedding_dim: float=3072,
num_attention_heads: float = 8, num_attention_heads: float=8,
dropout: float = 0.1, dropout: float=0.1,
attention_dropout: float = 0.1, attention_dropout: float=0.1,
activation_dropout: float = 0.1, activation_dropout: float=0.1,
activation_fn: str = "relu", activation_fn: str="relu",
layer_norm_first: bool = False, layer_norm_first: bool=False,
has_relative_attention_bias: bool = True, has_relative_attention_bias: bool=True,
num_buckets: int = 0, num_buckets: int=0,
max_distance: int = 0, max_distance: int=0,
rescale_init: bool = False, rescale_init: bool=False,
gru_rel_pos: bool = True, gru_rel_pos: bool=True, ) -> None:
) -> None:
super().__init__() super().__init__()
# Initialize parameters # Initialize parameters
@ -666,8 +648,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
num_buckets=num_buckets, num_buckets=num_buckets,
max_distance=max_distance, max_distance=max_distance,
rescale_init=rescale_init, rescale_init=rescale_init,
gru_rel_pos=gru_rel_pos, gru_rel_pos=gru_rel_pos, )
)
self.dropout1 = nn.Dropout(dropout) self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(self.activation_dropout) self.dropout2 = nn.Dropout(self.activation_dropout)
@ -679,7 +660,8 @@ class TransformerSentenceEncoderLayer(nn.Layer):
self.self_attn_layer_norm = LayerNorm(self.embedding_dim) self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
if self.activation_name == "glu": if self.activation_name == "glu":
self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish") self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim,
"swish")
else: else:
self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
@ -687,14 +669,12 @@ class TransformerSentenceEncoderLayer(nn.Layer):
# layer norm associated with the position wise feed-forward NN # layer norm associated with the position wise feed-forward NN
self.final_layer_norm = LayerNorm(self.embedding_dim) self.final_layer_norm = LayerNorm(self.embedding_dim)
def forward( def forward(self,
self,
x: Tensor, x: Tensor,
self_attn_mask: Tensor = None, self_attn_mask: Tensor=None,
self_attn_padding_mask: Tensor = None, self_attn_padding_mask: Tensor=None,
need_weights: bool = False, need_weights: bool=False,
pos_bias=None pos_bias=None):
):
""" """
LayerNorm is applied either before or after the self-attention/ffn LayerNorm is applied either before or after the self-attention/ffn
modules similar to the original Transformer imlementation. modules similar to the original Transformer imlementation.
@ -710,8 +690,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask, key_padding_mask=self_attn_padding_mask,
need_weights=False, need_weights=False,
attn_mask=self_attn_mask, attn_mask=self_attn_mask,
position_bias=pos_bias position_bias=pos_bias)
)
# import pdb; pdb.set_trace() # import pdb; pdb.set_trace()
x = self.dropout1(x) x = self.dropout1(x)
x = residual + x x = residual + x
@ -734,8 +713,7 @@ class TransformerSentenceEncoderLayer(nn.Layer):
key_padding_mask=self_attn_padding_mask, key_padding_mask=self_attn_padding_mask,
need_weights=need_weights, need_weights=need_weights,
attn_mask=self_attn_mask, attn_mask=self_attn_mask,
position_bias=pos_bias position_bias=pos_bias)
)
x = self.dropout1(x) x = self.dropout1(x)
x = residual + x x = residual + x

@ -138,7 +138,7 @@ class Pitch():
input: np.ndarray, input: np.ndarray,
use_continuous_f0: bool=True, use_continuous_f0: bool=True,
use_log_f0: bool=True) -> np.ndarray: use_log_f0: bool=True) -> np.ndarray:
input = input.astype(np.float) input = input.astype(np.float_)
frame_period = 1000 * self.hop_length / self.sr frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio( f0, timeaxis = pyworld.dio(
input, input,

@ -36,7 +36,7 @@ def convert_dtype_to_np_dtype_(dtype):
elif dtype is core.VarDesc.VarType.FP16: elif dtype is core.VarDesc.VarType.FP16:
return np.float16 return np.float16
elif dtype is core.VarDesc.VarType.BOOL: elif dtype is core.VarDesc.VarType.BOOL:
return np.bool return np.bool_
elif dtype is core.VarDesc.VarType.INT32: elif dtype is core.VarDesc.VarType.INT32:
return np.int32 return np.int32
elif dtype is core.VarDesc.VarType.INT64: elif dtype is core.VarDesc.VarType.INT64:

Loading…
Cancel
Save