change CodeStyle, test=asr

pull/2755/head
yeyupiaoling 3 years ago
parent 2aa84571c0
commit 34acf5f970

@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention", "RelPositionMultiHeadedAttention2"]
__all__ = [
"MultiHeadedAttention", "RelPositionMultiHeadedAttention",
"RelPositionMultiHeadedAttention2"
]
# Relative Positional Encodings
# https://www.jianshu.com/p/c0608efcc26f
@ -341,7 +344,13 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
dropout_rate (float): Dropout rate.
"""
def __init__(self, n_head, n_feat, dropout_rate, do_rel_shift=False, adaptive_scale=False, init_weights=False):
def __init__(self,
n_head,
n_feat,
dropout_rate,
do_rel_shift=False,
adaptive_scale=False,
init_weights=False):
"""Construct an RelPositionMultiHeadedAttention object."""
super().__init__(n_head, n_feat, dropout_rate)
# linear transformation for positional encoding
@ -349,30 +358,44 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
self.do_rel_shift = do_rel_shift
pos_bias_u = self.create_parameter([self.h, self.d_k], default_initializer=I.XavierUniform())
pos_bias_u = self.create_parameter(
[self.h, self.d_k], default_initializer=I.XavierUniform())
self.add_parameter('pos_bias_u', pos_bias_u)
pos_bias_v = self.create_parameter([self.h, self.d_k], default_initializer=I.XavierUniform())
pos_bias_v = self.create_parameter(
[self.h, self.d_k], default_initializer=I.XavierUniform())
self.add_parameter('pos_bias_v', pos_bias_v)
self.adaptive_scale = adaptive_scale
ada_scale = self.create_parameter([1, 1, n_feat], default_initializer=I.Constant(1.0))
ada_scale = self.create_parameter(
[1, 1, n_feat], default_initializer=I.Constant(1.0))
self.add_parameter('ada_scale', ada_scale)
ada_bias = self.create_parameter([1, 1, n_feat], default_initializer=I.Constant(0.0))
ada_bias = self.create_parameter(
[1, 1, n_feat], default_initializer=I.Constant(0.0))
self.add_parameter('ada_bias', ada_bias)
if init_weights:
self.init_weights()
def init_weights(self):
input_max = (self.h * self.d_k)**-0.5
self.linear_q._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_q._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_k._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_k._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_v._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_v._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_pos._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_pos._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_out._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_out._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max)
self.linear_q._param_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_q._bias_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_k._param_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_k._bias_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_v._param_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_v._bias_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_pos._param_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_pos._bias_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_out._param_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
self.linear_out._bias_attr = paddle.nn.initializer.Uniform(
low=-input_max, high=input_max)
def rel_shift(self, x, zero_triu: bool=False):
"""Compute relative positinal encoding.
@ -383,10 +406,12 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
Returns:
paddle.Tensor: Output tensor. (batch, head, time1, time1)
"""
zero_pad = paddle.zeros([x.shape[0], x.shape[1], x.shape[2], 1], dtype=x.dtype)
zero_pad = paddle.zeros(
[x.shape[0], x.shape[1], x.shape[2], 1], dtype=x.dtype)
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2]])
x_padded = x_padded.reshape(
[x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2]])
x = x_padded[:, :, 1:].reshape(paddle.shape(x)) # [B, H, T1, T1]
if zero_triu:
@ -395,12 +420,14 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
return x
def forward(self, query: paddle.Tensor,
key: paddle.Tensor, value: paddle.Tensor,
def forward(self,
query: paddle.Tensor,
key: paddle.Tensor,
value: paddle.Tensor,
mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
pos_emb: paddle.Tensor=paddle.empty([0]),
cache: paddle.Tensor = paddle.zeros((0, 0, 0, 0))
) -> Tuple[paddle.Tensor, paddle.Tensor]:
cache: paddle.Tensor=paddle.zeros(
(0, 0, 0, 0))) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
query (paddle.Tensor): Query tensor (#batch, time1, size).
@ -434,7 +461,8 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
new_cache = paddle.concat((k, v), axis=-1)
n_batch_pos = pos_emb.shape[0]
p = self.linear_pos(pos_emb).reshape([n_batch_pos, -1, self.h, self.d_k])
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)
# (batch, head, time1, d_k)
@ -460,6 +488,7 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
if self.do_rel_shift:
matrix_bd = self.rel_shift(matrix_bd)
scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2)
scores = (matrix_ac + matrix_bd) / math.sqrt(
self.d_k) # (batch, head, time1, time2)
return self.forward_attention(v, scores, mask), new_cache
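The attention scores above combine a content term (matrix_ac) with a position term (matrix_bd), following Transformer-XL Section 3.3, and rel_shift realigns the position term. A minimal NumPy sketch of that pad-and-reshape shift (toy input and values; the real method operates on paddle tensors of shape [batch, head, time1, time1]):

```python
# NumPy sketch of the rel_shift trick used above (toy shapes; not the paddle code).
import numpy as np

def rel_shift(x: np.ndarray) -> np.ndarray:
    b, h, t1, t2 = x.shape
    zero_pad = np.zeros((b, h, t1, 1), dtype=x.dtype)
    x_padded = np.concatenate([zero_pad, x], axis=-1)  # (b, h, t1, t2 + 1)
    x_padded = x_padded.reshape(b, h, t2 + 1, t1)      # the pad column staggers the rows
    return x_padded[:, :, 1:].reshape(b, h, t1, t2)    # drop the pad, restore the shape

x = np.arange(9, dtype=np.float32).reshape(1, 1, 3, 3)
# The last row stays put; each row above it is shifted one step further left
# (the wrapped-around entries are what the zero_triu option masks in the real module).
print(rel_shift(x))
```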

@ -1,4 +1,5 @@
from typing import Union, Optional
from typing import Optional
from typing import Union
import paddle
import paddle.nn.functional as F
@ -12,8 +13,7 @@ class Conv2DValid(_ConvNd):
Conv2d operator for VALID mode padding.
"""
def __init__(
self,
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: int,
@ -26,9 +26,9 @@ class Conv2DValid(_ConvNd):
bias_attr=None,
data_format="NCHW",
valid_trigx: bool=False,
valid_trigy: bool = False
) -> None:
super(Conv2DValid, self).__init__(in_channels,
valid_trigy: bool=False) -> None:
super(Conv2DValid, self).__init__(
in_channels,
out_channels,
kernel_size,
False,
@ -44,13 +44,19 @@ class Conv2DValid(_ConvNd):
self.valid_trigx = valid_trigx
self.valid_trigy = valid_trigy
def _conv_forward(self, input: paddle.Tensor, weight: paddle.Tensor, bias: Optional[paddle.Tensor]):
def _conv_forward(self,
input: paddle.Tensor,
weight: paddle.Tensor,
bias: Optional[paddle.Tensor]):
validx, validy = 0, 0
if self.valid_trigx:
validx = (input.shape[-2] * (self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2
validx = (input.shape[-2] *
(self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2
if self.valid_trigy:
validy = (input.shape[-1] * (self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2
return F.conv2d(input, weight, bias, self._stride, (validx, validy), self._dilation, self._groups)
validy = (input.shape[-1] *
(self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2
return F.conv2d(input, weight, bias, self._stride, (validx, validy),
self._dilation, self._groups)
def forward(self, input: paddle.Tensor) -> paddle.Tensor:
return self._conv_forward(input, self.weight, self.bias)
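Conv2DValid pads an axis only when its valid_trigx / valid_trigy flag is set; a plain-Python sketch of that arithmetic (the helper names here are illustrative, only the formula comes from the code above):

```python
# Padding arithmetic of Conv2DValid._conv_forward (illustrative helper names).
def valid_pad(length: int, kernel: int, stride: int) -> int:
    # Mirrors: (input.shape[-1] * (stride - 1) - 1 + kernel) // 2
    return (length * (stride - 1) - 1 + kernel) // 2

def conv_out_len(length: int, kernel: int, stride: int, pad: int) -> int:
    # Standard convolution output-length formula.
    return (length + 2 * pad - kernel) // stride + 1

L, k, s = 17, 5, 1
p = valid_pad(L, k, s)              # (17 * 0 - 1 + 5) // 2 = 2
print(p, conv_out_len(L, k, s, p))  # 2 17 -> input length preserved for stride 1
```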

@ -35,9 +35,11 @@ class ConvolutionModule2(nn.Layer):
self.channels = channels
self.kernel_size = kernel_size
self.adaptive_scale = adaptive_scale
ada_scale = self.create_parameter([1, 1, channels], default_initializer=I.Constant(1.0))
ada_scale = self.create_parameter(
[1, 1, channels], default_initializer=I.Constant(1.0))
self.add_parameter('ada_scale', ada_scale)
ada_bias = self.create_parameter([1, 1, channels], default_initializer=I.Constant(0.0))
ada_bias = self.create_parameter(
[1, 1, channels], default_initializer=I.Constant(0.0))
self.add_parameter('ada_bias', ada_bias)
self.pointwise_conv1 = Conv1D(
@ -98,15 +100,21 @@ class ConvolutionModule2(nn.Layer):
def init_weights(self):
pw_max = self.channels**-0.5
dw_max = self.kernel_size**-0.5
self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
if self.bias:
self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
if self.bias:
self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
if self.bias:
self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
def forward(
self,
@ -137,7 +145,8 @@ class ConvolutionModule2(nn.Layer):
if self.lorder > 0:
if cache.shape[2] == 0: # cache_t == 0
x = nn.functional.pad(x, [self.lorder, 0], 'constant', 0.0, data_format='NCL')
x = nn.functional.pad(
x, [self.lorder, 0], 'constant', 0.0, data_format='NCL')
else:
assert cache.shape[0] == x.shape[0] # B
assert cache.shape[1] == x.shape[1] # C
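The ada_scale / ada_bias parameters registered above form a learned per-channel affine transform; assuming the forward pass applies it as scale-then-bias, as Squeezeformer-style modules typically do, a NumPy sketch with toy shapes:

```python
# Hedged NumPy sketch of the adaptive-scale parameters (toy shapes).
import numpy as np

channels = 4
ada_scale = np.ones((1, 1, channels), dtype=np.float32)   # Constant(1.0) above
ada_bias = np.zeros((1, 1, channels), dtype=np.float32)   # Constant(0.0) above

x = np.random.randn(2, 10, channels).astype(np.float32)   # (batch, time, channels)
y = ada_scale * x + ada_bias                              # broadcasts over batch and time
assert np.allclose(y, x)                                  # identity until training updates them
```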

@ -14,36 +14,49 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Encoder definition."""
from typing import Tuple, Union, Optional, List
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
import paddle
from paddle import nn
from typeguard import check_argument_types
from paddlespeech.s2t.modules.activation import get_activation
from paddlespeech.s2t.modules.align import LayerNorm, Linear
from paddlespeech.s2t.modules.attention import MultiHeadedAttention, RelPositionMultiHeadedAttention2
from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.attention import MultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention2
from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule
from paddlespeech.s2t.modules.convolution import ConvolutionModule2
from paddlespeech.s2t.modules.embedding import NoPositionalEncoding
from paddlespeech.s2t.modules.embedding import PositionalEncoding
from paddlespeech.s2t.modules.embedding import RelPositionalEncoding
from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer, SqueezeformerEncoderLayer
from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer
from paddlespeech.s2t.modules.encoder_layer import SqueezeformerEncoderLayer
from paddlespeech.s2t.modules.encoder_layer import TransformerEncoderLayer
from paddlespeech.s2t.modules.mask import add_optional_chunk_mask
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward, PositionwiseFeedForward2
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4, TimeReductionLayerStream, TimeReductionLayer1D, \
DepthwiseConv2DSubsampling4, TimeReductionLayer2D
from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward2
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling6
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling8
from paddlespeech.s2t.modules.subsampling import DepthwiseConv2DSubsampling4
from paddlespeech.s2t.modules.subsampling import LinearNoSubsampling
from paddlespeech.s2t.modules.subsampling import TimeReductionLayer1D
from paddlespeech.s2t.modules.subsampling import TimeReductionLayer2D
from paddlespeech.s2t.modules.subsampling import TimeReductionLayerStream
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder", "SqueezeformerEncoder"]
__all__ = [
"BaseEncoder", 'TransformerEncoder', "ConformerEncoder",
"SqueezeformerEncoder"
]
class BaseEncoder(nn.Layer):
@ -492,8 +505,7 @@ class ConformerEncoder(BaseEncoder):
class SqueezeformerEncoder(nn.Layer):
def __init__(
self,
def __init__(self,
input_size: int,
encoder_dim: int=256,
output_size: int=256,
@ -521,8 +533,7 @@ class SqueezeformerEncoder(nn.Layer):
use_dynamic_chunk: bool=False,
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_left_chunk: bool = False
):
use_dynamic_left_chunk: bool=False):
"""Construct SqueezeformerEncoder
Args:
@ -577,49 +588,40 @@ class SqueezeformerEncoder(nn.Layer):
# self-attention module definition
if pos_enc_layer_type != "rel_pos":
encoder_selfattn_layer = MultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads,
output_size,
encoder_selfattn_layer_args = (attention_heads, output_size,
attention_dropout_rate)
else:
encoder_selfattn_layer = RelPositionMultiHeadedAttention2
encoder_selfattn_layer_args = (attention_heads,
encoder_dim,
attention_dropout_rate,
do_rel_shift,
adaptive_scale,
init_weights)
encoder_selfattn_layer_args = (attention_heads, encoder_dim,
attention_dropout_rate, do_rel_shift,
adaptive_scale, init_weights)
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward2
positionwise_layer_args = (encoder_dim,
encoder_dim * feed_forward_expansion_factor,
feed_forward_dropout_rate,
activation,
adaptive_scale,
init_weights)
positionwise_layer_args = (
encoder_dim, encoder_dim * feed_forward_expansion_factor,
feed_forward_dropout_rate, activation, adaptive_scale, init_weights)
# convolution module definition
convolution_layer = ConvolutionModule2
convolution_layer_args = (encoder_dim, cnn_module_kernel, activation,
cnn_norm_type, causal, True, adaptive_scale, init_weights)
self.embed = DepthwiseConv2DSubsampling4(1, encoder_dim,
RelPositionalEncoding(encoder_dim, dropout_rate=0.1),
dw_stride,
input_size,
input_dropout_rate,
cnn_norm_type, causal, True, adaptive_scale,
init_weights)
self.embed = DepthwiseConv2DSubsampling4(
1, encoder_dim,
RelPositionalEncoding(encoder_dim, dropout_rate=0.1), dw_stride,
input_size, input_dropout_rate, init_weights)
self.preln = LayerNorm(encoder_dim)
self.encoders = paddle.nn.LayerList([SqueezeformerEncoderLayer(
self.encoders = paddle.nn.LayerList([
SqueezeformerEncoderLayer(
encoder_dim,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
convolution_layer(*convolution_layer_args),
positionwise_layer(*positionwise_layer_args),
normalize_before,
dropout,
concat_after) for _ in range(num_blocks)
positionwise_layer(*positionwise_layer_args), normalize_before,
dropout, concat_after) for _ in range(num_blocks)
])
if time_reduction_layer_type == 'conv1d':
time_reduction_layer = TimeReductionLayer1D
@ -637,7 +639,8 @@ class SqueezeformerEncoder(nn.Layer):
time_reduction_layer = TimeReductionLayer2D
time_reduction_layer_args = {'encoder_dim': encoder_dim}
self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args)
self.time_reduction_layer = time_reduction_layer(
**time_reduction_layer_args)
self.time_recover_layer = Linear(encoder_dim, encoder_dim)
self.final_proj = None
if output_size != encoder_dim:
@ -674,11 +677,9 @@ class SqueezeformerEncoder(nn.Layer):
xs = self.global_cmvn(xs)
xs, pos_emb, masks = self.embed(xs, masks)
mask_pad = ~masks
chunk_masks = add_optional_chunk_mask(xs, masks,
self.use_dynamic_chunk,
self.use_dynamic_left_chunk,
decoding_chunk_size,
self.static_chunk_size,
chunk_masks = add_optional_chunk_mask(
xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
decoding_chunk_size, self.static_chunk_size,
num_decoding_left_chunks)
xs_lens = chunk_masks.squeeze(1).sum(1)
xs = self.preln(xs)
@ -688,15 +689,18 @@ class SqueezeformerEncoder(nn.Layer):
for i, layer in enumerate(self.encoders):
if self.reduce_idx is not None:
if self.time_reduce is not None and i in self.reduce_idx:
recover_activations.append((xs, chunk_masks, pos_emb, mask_pad))
xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad)
recover_activations.append(
(xs, chunk_masks, pos_emb, mask_pad))
xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer(
xs, xs_lens, chunk_masks, mask_pad)
pos_emb = pos_emb[:, ::2, :]
index += 1
if self.recover_idx is not None:
if self.time_reduce == 'recover' and i in self.recover_idx:
index -= 1
recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[index]
recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[
index]
# recover output length for ctc decode
xs = paddle.repeat_interleave(xs, repeats=2, axis=1)
xs = self.time_recover_layer(xs)
@ -786,7 +790,8 @@ class SqueezeformerEncoder(nn.Layer):
elayers, cache_t1 = att_cache.shape[0], att_cache.shape[2]
chunk_size = xs.shape[1]
attention_key_size = cache_t1 + chunk_size
pos_emb = self.embed.position_encoding(offset=offset - cache_t1, size=attention_key_size)
pos_emb = self.embed.position_encoding(
offset=offset - cache_t1, size=attention_key_size)
if required_cache_size < 0:
next_cache_start = 0
elif required_cache_size == 0:
@ -811,15 +816,18 @@ class SqueezeformerEncoder(nn.Layer):
# shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
if self.reduce_idx is not None:
if self.time_reduce is not None and i in self.reduce_idx:
recover_activations.append((xs, att_mask, pos_emb, mask_pad))
xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad)
recover_activations.append(
(xs, att_mask, pos_emb, mask_pad))
xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer(
xs, xs_lens, att_mask, mask_pad)
pos_emb = pos_emb[:, ::2, :]
index += 1
if self.recover_idx is not None:
if self.time_reduce == 'recover' and i in self.recover_idx:
index -= 1
recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[index]
recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[
index]
# recover output length for ctc decode
xs = paddle.repeat_interleave(xs, repeats=2, axis=1)
xs = self.time_recover_layer(xs)
@ -830,7 +838,9 @@ class SqueezeformerEncoder(nn.Layer):
mask_pad = recover_mask_pad
factor = self.calculate_downsampling_factor(i)
att_cache1 = att_cache[i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[1], :]
att_cache1 = att_cache[
i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[
1], :]
cnn_cache1 = cnn_cache[i] if cnn_cache.shape[0] > 0 else cnn_cache
xs, _, new_att_cache, new_cnn_cache = layer(
xs,
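A note on the reduce/recover bookkeeping reformatted in the two forward passes above: activations are stashed before each time-reduction layer and the frame rate is restored at the recover layers. A toy length-only sketch (layer indices and frame counts are made up; the real code stores the full (xs, chunk_masks, pos_emb, mask_pad) tuples):

```python
# Toy sketch of the time reduce/recover length bookkeeping (made-up values).
reduce_idx, recover_idx = [2], [5]
saved_lengths = []
frames = 100
for i in range(8):                     # pretend 8 encoder layers
    if i in reduce_idx:
        saved_lengths.append(frames)   # recover_activations.append(...)
        frames = (frames + 1) // 2     # time_reduction_layer halves the frame rate
    if i in recover_idx:
        frames = saved_lengths.pop()   # repeat_interleave(repeats=2) + stored activations
    print(f"layer {i}: {frames} frames")
```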

@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ["TransformerEncoderLayer", "ConformerEncoderLayer", "SqueezeformerEncoderLayer"]
__all__ = [
"TransformerEncoderLayer", "ConformerEncoderLayer",
"SqueezeformerEncoderLayer"
]
class TransformerEncoderLayer(nn.Layer):
@ -281,8 +284,7 @@ class ConformerEncoderLayer(nn.Layer):
class SqueezeformerEncoderLayer(nn.Layer):
"""Encoder layer module."""
def __init__(
self,
def __init__(self,
size: int,
self_attn: paddle.nn.Layer,
feed_forward1: Optional[nn.Layer]=None,

@ -16,8 +16,8 @@
"""Positionwise feed forward layer definition."""
import paddle
from paddle import nn
from paddle.nn import initializer as I
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
@ -90,9 +90,11 @@ class PositionwiseFeedForward2(paddle.nn.Layer):
self.dropout = paddle.nn.Dropout(dropout_rate)
self.w_2 = Linear(hidden_units, idim)
self.adaptive_scale = adaptive_scale
ada_scale = self.create_parameter([1, 1, idim], default_initializer=I.XavierUniform())
ada_scale = self.create_parameter(
[1, 1, idim], default_initializer=I.XavierUniform())
self.add_parameter('ada_scale', ada_scale)
ada_bias = self.create_parameter([1, 1, idim], default_initializer=I.XavierUniform())
ada_bias = self.create_parameter(
[1, 1, idim], default_initializer=I.XavierUniform())
self.add_parameter('ada_bias', ada_bias)
if init_weights:
@ -101,10 +103,14 @@ class PositionwiseFeedForward2(paddle.nn.Layer):
def init_weights(self):
ffn1_max = self.idim**-0.5
ffn2_max = self.hidden_units**-0.5
self.w_1._param_attr = paddle.nn.initializer.Uniform(low=-ffn1_max, high=ffn1_max)
self.w_1._bias_attr = paddle.nn.initializer.Uniform(low=-ffn1_max, high=ffn1_max)
self.w_2._param_attr = paddle.nn.initializer.Uniform(low=-ffn2_max, high=ffn2_max)
self.w_2._bias_attr = paddle.nn.initializer.Uniform(low=-ffn2_max, high=ffn2_max)
self.w_1._param_attr = paddle.nn.initializer.Uniform(
low=-ffn1_max, high=ffn1_max)
self.w_1._bias_attr = paddle.nn.initializer.Uniform(
low=-ffn1_max, high=ffn1_max)
self.w_2._param_attr = paddle.nn.initializer.Uniform(
low=-ffn2_max, high=ffn2_max)
self.w_2._bias_attr = paddle.nn.initializer.Uniform(
low=-ffn2_max, high=ffn2_max)
def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
"""Forward function.

@ -21,7 +21,8 @@ import paddle.nn.functional as F
from paddle import nn
from paddlespeech.s2t import masked_fill
from paddlespeech.s2t.modules.align import Conv2D, Conv1D
from paddlespeech.s2t.modules.align import Conv1D
from paddlespeech.s2t.modules.align import Conv2D
from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.conv2d import Conv2DValid
@ -267,8 +268,9 @@ class DepthwiseConv2DSubsampling4(BaseSubsampling):
"""
def __init__(
self, idim: int, odim: int,
def __init__(self,
idim: int,
odim: int,
pos_enc_class: nn.Layer,
dw_stride: bool=False,
input_size: int=80,
@ -277,9 +279,14 @@ class DepthwiseConv2DSubsampling4(BaseSubsampling):
super(DepthwiseConv2DSubsampling4, self).__init__()
self.idim = idim
self.odim = odim
self.pw_conv = Conv2D(in_channels=idim, out_channels=odim, kernel_size=3, stride=2)
self.pw_conv = Conv2D(
in_channels=idim, out_channels=odim, kernel_size=3, stride=2)
self.act1 = nn.ReLU()
self.dw_conv = Conv2D(in_channels=odim, out_channels=odim, kernel_size=3, stride=2,
self.dw_conv = Conv2D(
in_channels=odim,
out_channels=odim,
kernel_size=3,
stride=2,
groups=odim if dw_stride else 1)
self.act2 = nn.ReLU()
self.pos_enc = pos_enc_class
@ -288,18 +295,18 @@ class DepthwiseConv2DSubsampling4(BaseSubsampling):
nn.Dropout(p=input_dropout_rate))
if init_weights:
linear_max = (odim * input_size / 4)**-0.5
self.input_proj.state_dict()['0.weight'] = paddle.nn.initializer.Uniform(low=-linear_max, high=linear_max)
self.input_proj.state_dict()['0.bias'] = paddle.nn.initializer.Uniform(low=-linear_max, high=linear_max)
self.input_proj.state_dict()[
'0.weight'] = paddle.nn.initializer.Uniform(
low=-linear_max, high=linear_max)
self.input_proj.state_dict()[
'0.bias'] = paddle.nn.initializer.Uniform(
low=-linear_max, high=linear_max)
self.subsampling_rate = 4
# 6 = (3 - 1) * 1 + (3 - 1) * 2
self.right_context = 6
def forward(
self,
x: paddle.Tensor,
x_mask: paddle.Tensor,
offset: int = 0
def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
x = x.unsqueeze(1) # (b, c=1, t, f)
x = self.pw_conv(x)
@ -327,7 +334,11 @@ class TimeReductionLayer1D(nn.Layer):
stride (int): Downsampling factor in time dimension.
"""
def __init__(self, channel: int, out_dim: int, kernel_size: int = 5, stride: int = 2):
def __init__(self,
channel: int,
out_dim: int,
kernel_size: int=5,
stride: int=2):
super(TimeReductionLayer1D, self).__init__()
self.channel = channel
@ -342,28 +353,37 @@ class TimeReductionLayer1D(nn.Layer):
kernel_size=kernel_size,
stride=stride,
padding=self.padding,
groups=channel,
)
groups=channel, )
self.pw_conv = Conv1D(
in_channels=channel, out_channels=out_dim,
kernel_size=1, stride=1, padding=0, groups=1,
)
in_channels=channel,
out_channels=out_dim,
kernel_size=1,
stride=1,
padding=0,
groups=1, )
self.init_weights()
def init_weights(self):
dw_max = self.kernel_size**-0.5
pw_max = self.channel**-0.5
self.dw_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.pw_conv._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
def forward(self, xs, xs_lens: paddle.Tensor,
def forward(
self,
xs,
xs_lens: paddle.Tensor,
mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
mask_pad: paddle.Tensor = paddle.ones((0, 0, 0), dtype=paddle.bool),
):
mask_pad: paddle.Tensor=paddle.ones((0, 0, 0),
dtype=paddle.bool), ):
xs = xs.transpose([0, 2, 1]) # [B, C, T]
xs = masked_fill(xs, mask_pad.equal(0), 0.0)
@ -392,12 +412,14 @@ class TimeReductionLayer2D(nn.Layer):
super(TimeReductionLayer2D, self).__init__()
self.encoder_dim = encoder_dim
self.kernel_size = kernel_size
self.dw_conv = Conv2DValid(in_channels=encoder_dim,
self.dw_conv = Conv2DValid(
in_channels=encoder_dim,
out_channels=encoder_dim,
kernel_size=(kernel_size, 1),
stride=stride,
valid_trigy=True)
self.pw_conv = Conv2DValid(in_channels=encoder_dim,
self.pw_conv = Conv2DValid(
in_channels=encoder_dim,
out_channels=encoder_dim,
kernel_size=1,
stride=1,
@ -411,27 +433,35 @@ class TimeReductionLayer2D(nn.Layer):
def init_weights(self):
dw_max = self.kernel_size**-0.5
pw_max = self.encoder_dim**-0.5
self.dw_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.pw_conv._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
def forward(
self, xs: paddle.Tensor, xs_lens: paddle.Tensor,
self,
xs: paddle.Tensor,
xs_lens: paddle.Tensor,
mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
xs = masked_fill(xs, mask_pad.transpose([0, 2, 1]).equal(0), 0.0)
xs = xs.unsqueeze(1)
padding1 = self.kernel_size - self.stride
xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.)
xs = F.pad(
xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.)
xs = self.dw_conv(xs.transpose([0, 3, 2, 1]))
xs = self.pw_conv(xs).transpose([0, 3, 2, 1]).squeeze(1)
tmp_length = xs.shape[1]
xs_lens = (xs_lens + 1) // 2
padding2 = max(0, (xs_lens.max() - tmp_length).item())
batch_size, hidden = xs.shape[0], xs.shape[-1]
dummy_pad = paddle.zeros([batch_size, padding2, hidden], dtype=paddle.float32)
dummy_pad = paddle.zeros(
[batch_size, padding2, hidden], dtype=paddle.float32)
xs = paddle.concat([xs, dummy_pad], axis=1)
mask = mask[:, ::2, ::2]
mask_pad = mask_pad[:, :, ::2]
@ -451,8 +481,11 @@ class TimeReductionLayerStream(nn.Layer):
stride (int): Downsampling factor in time dimension.
"""
def __init__(self, channel: int, out_dim: int,
kernel_size: int = 1, stride: int = 2):
def __init__(self,
channel: int,
out_dim: int,
kernel_size: int=1,
stride: int=2):
super(TimeReductionLayerStream, self).__init__()
self.channel = channel
@ -460,14 +493,16 @@ class TimeReductionLayerStream(nn.Layer):
self.kernel_size = kernel_size
self.stride = stride
self.dw_conv = Conv1D(in_channels=channel,
self.dw_conv = Conv1D(
in_channels=channel,
out_channels=channel,
kernel_size=kernel_size,
stride=stride,
padding=0,
groups=channel)
self.pw_conv = Conv1D(in_channels=channel,
self.pw_conv = Conv1D(
in_channels=channel,
out_channels=out_dim,
kernel_size=1,
stride=1,
@ -478,12 +513,19 @@ class TimeReductionLayerStream(nn.Layer):
def init_weights(self):
dw_max = self.kernel_size**-0.5
pw_max = self.channel**-0.5
self.dw_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max)
self.pw_conv._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max)
self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-dw_max, high=dw_max)
self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
low=-pw_max, high=pw_max)
def forward(self, xs, xs_lens: paddle.Tensor,
def forward(
self,
xs,
xs_lens: paddle.Tensor,
mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)):
xs = xs.transpose([0, 2, 1]) # [B, C, T]
