change CodeStyle, test=asr

pull/2755/head
yeyupiaoling 3 years ago
parent 2aa84571c0
commit 34acf5f970

paddlespeech/s2t/modules/attention.py

@@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()

__all__ = [
    "MultiHeadedAttention", "RelPositionMultiHeadedAttention",
    "RelPositionMultiHeadedAttention2"
]


# Relative Positional Encodings
# https://www.jianshu.com/p/c0608efcc26f

@@ -341,7 +344,13 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
        dropout_rate (float): Dropout rate.
    """

    def __init__(self,
                 n_head,
                 n_feat,
                 dropout_rate,
                 do_rel_shift=False,
                 adaptive_scale=False,
                 init_weights=False):
        """Construct an RelPositionMultiHeadedAttention object."""
        super().__init__(n_head, n_feat, dropout_rate)
        # linear transformation for positional encoding

@@ -349,32 +358,46 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        self.do_rel_shift = do_rel_shift
        pos_bias_u = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
        self.add_parameter('pos_bias_u', pos_bias_u)
        pos_bias_v = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
        self.add_parameter('pos_bias_v', pos_bias_v)
        self.adaptive_scale = adaptive_scale
        ada_scale = self.create_parameter(
            [1, 1, n_feat], default_initializer=I.Constant(1.0))
        self.add_parameter('ada_scale', ada_scale)
        ada_bias = self.create_parameter(
            [1, 1, n_feat], default_initializer=I.Constant(0.0))
        self.add_parameter('ada_bias', ada_bias)
        if init_weights:
            self.init_weights()

    def init_weights(self):
        input_max = (self.h * self.d_k)**-0.5
        self.linear_q._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_q._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_k._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_k._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_v._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_v._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_pos._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_pos._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_out._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_out._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)

    def rel_shift(self, x, zero_triu: bool=False):
        """Compute relative positional encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, head, time1, time1).

@@ -383,10 +406,12 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
        Returns:
            paddle.Tensor: Output tensor. (batch, head, time1, time1)
        """
        zero_pad = paddle.zeros(
            [x.shape[0], x.shape[1], x.shape[2], 1], dtype=x.dtype)
        x_padded = paddle.concat([zero_pad, x], axis=-1)

        x_padded = x_padded.reshape(
            [x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2]])
        x = x_padded[:, :, 1:].reshape(paddle.shape(x))  # [B, H, T1, T1]

        if zero_triu:

@@ -395,12 +420,14 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
        return x

    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
                mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
                pos_emb: paddle.Tensor=paddle.empty([0]),
                cache: paddle.Tensor=paddle.zeros(
                    (0, 0, 0, 0))) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).

@@ -434,7 +461,8 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
            new_cache = paddle.concat((k, v), axis=-1)

        n_batch_pos = pos_emb.shape[0]
        p = self.linear_pos(pos_emb).reshape(
            [n_batch_pos, -1, self.h, self.d_k])
        p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)

@@ -460,6 +488,7 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention):
        if self.do_rel_shift:
            matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask), new_cache
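Note: rel_shift above is the Transformer-XL pad-and-reshape trick; padding one zero column and reinterpreting the layout realigns every query row with its relative positions. A minimal standalone sketch (hypothetical helper, same tensor layout as the code above):

import paddle

def rel_shift(x: paddle.Tensor) -> paddle.Tensor:
    # x: (batch, head, time1, time1) scores indexed by relative position
    b, h, t1, t2 = x.shape
    zero_pad = paddle.zeros([b, h, t1, 1], dtype=x.dtype)
    x_padded = paddle.concat([zero_pad, x], axis=-1)    # (b, h, t1, t2 + 1)
    x_padded = x_padded.reshape([b, h, t2 + 1, t1])     # shift by reinterpreting the memory layout
    return x_padded[:, :, 1:].reshape([b, h, t1, t2])   # drop the pad row, restore the shape

scores = paddle.randn([2, 4, 8, 8])
print(rel_shift(scores).shape)  # [2, 4, 8, 8]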

paddlespeech/s2t/modules/conv2d.py

@@ -1,4 +1,5 @@
from typing import Optional
from typing import Union

import paddle
import paddle.nn.functional as F

@@ -12,45 +13,50 @@ class Conv2DValid(_ConvNd):
    Conv2d operator for VALID mode padding.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int=1,
                 padding: Union[str, int]=0,
                 dilation: int=1,
                 groups: int=1,
                 padding_mode: str='zeros',
                 weight_attr=None,
                 bias_attr=None,
                 data_format="NCHW",
                 valid_trigx: bool=False,
                 valid_trigy: bool=False) -> None:
        super(Conv2DValid, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            False,
            2,
            stride=stride,
            padding=padding,
            padding_mode=padding_mode,
            dilation=dilation,
            groups=groups,
            weight_attr=weight_attr,
            bias_attr=bias_attr,
            data_format=data_format)
        self.valid_trigx = valid_trigx
        self.valid_trigy = valid_trigy

    def _conv_forward(self,
                      input: paddle.Tensor,
                      weight: paddle.Tensor,
                      bias: Optional[paddle.Tensor]):
        validx, validy = 0, 0
        if self.valid_trigx:
            validx = (input.shape[-2] *
                      (self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2
        if self.valid_trigy:
            validy = (input.shape[-1] *
                      (self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2
        return F.conv2d(input, weight, bias, self._stride, (validx, validy),
                        self._dilation, self._groups)

    def forward(self, input: paddle.Tensor) -> paddle.Tensor:
        return self._conv_forward(input, self.weight, self.bias)
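Note: Conv2DValid computes its padding at call time rather than at construction, using the expression in _conv_forward for any axis whose valid_trig* flag is set. A small arithmetic sketch of that formula (hypothetical helper, not part of the module):

def valid_pad(length: int, stride: int, kernel: int) -> int:
    # mirrors: (input.shape[d] * (stride - 1) - 1 + kernel) // 2
    return (length * (stride - 1) - 1 + kernel) // 2

print(valid_pad(64, 2, 5))  # 34 for a length-64 axis, stride 2, kernel 5
print(valid_pad(1, 2, 1))   # 0 for the trailing singleton axis used by TimeReductionLayer2D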

paddlespeech/s2t/modules/convolution.py

@@ -16,13 +16,13 @@ class ConvolutionModule2(nn.Layer):
    def __init__(self,
                 channels: int,
                 kernel_size: int=15,
                 activation: nn.Layer=nn.ReLU(),
                 norm: str="batch_norm",
                 causal: bool=False,
                 bias: bool=True,
                 adaptive_scale: bool=False,
                 init_weights: bool=False):
        """Construct an ConvolutionModule object.
        Args:
            channels (int): The number of channels of conv layers.

@@ -35,9 +35,11 @@ class ConvolutionModule2(nn.Layer):
        self.channels = channels
        self.kernel_size = kernel_size
        self.adaptive_scale = adaptive_scale
        ada_scale = self.create_parameter(
            [1, 1, channels], default_initializer=I.Constant(1.0))
        self.add_parameter('ada_scale', ada_scale)
        ada_bias = self.create_parameter(
            [1, 1, channels], default_initializer=I.Constant(0.0))
        self.add_parameter('ada_bias', ada_bias)

        self.pointwise_conv1 = Conv1D(

@@ -96,23 +98,29 @@ class ConvolutionModule2(nn.Layer):
            self.init_weights()

    def init_weights(self):
        pw_max = self.channels**-0.5
        dw_max = self.kernel_size**-0.5
        self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        if self.bias:
            self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform(
                low=-pw_max, high=pw_max)
        self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        if self.bias:
            self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform(
                low=-dw_max, high=dw_max)
        self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        if self.bias:
            self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform(
                low=-pw_max, high=pw_max)

    def forward(
            self,
            x: paddle.Tensor,
            mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
            cache: paddle.Tensor=paddle.zeros([0, 0, 0]),
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute convolution module.
        Args:

@@ -137,7 +145,8 @@ class ConvolutionModule2(nn.Layer):
        if self.lorder > 0:
            if cache.shape[2] == 0:  # cache_t == 0
                x = nn.functional.pad(
                    x, [self.lorder, 0], 'constant', 0.0, data_format='NCL')
            else:
                assert cache.shape[0] == x.shape[0]  # B
                assert cache.shape[1] == x.shape[1]  # C
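Note: the ada_scale / ada_bias parameters are Squeezeformer-style adaptive scaling, a learnable per-channel affine applied to the block input. The exact placement inside ConvolutionModule2.forward is not shown in this diff, so the following is only a conceptual sketch:

import paddle

channels = 256
ada_scale = paddle.ones([1, 1, channels])   # initialized to 1.0, as above
ada_bias = paddle.zeros([1, 1, channels])   # initialized to 0.0, as above

x = paddle.randn([4, 100, channels])        # (batch, time, channels)
x = ada_scale * x + ada_bias                # broadcast over batch and time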

paddlespeech/s2t/modules/encoder.py

@@ -14,36 +14,49 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Encoder definition."""
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import paddle
from paddle import nn
from typeguard import check_argument_types

from paddlespeech.s2t.modules.activation import get_activation
from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.attention import MultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention2
from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule
from paddlespeech.s2t.modules.convolution import ConvolutionModule2
from paddlespeech.s2t.modules.embedding import NoPositionalEncoding
from paddlespeech.s2t.modules.embedding import PositionalEncoding
from paddlespeech.s2t.modules.embedding import RelPositionalEncoding
from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer
from paddlespeech.s2t.modules.encoder_layer import SqueezeformerEncoderLayer
from paddlespeech.s2t.modules.encoder_layer import TransformerEncoderLayer
from paddlespeech.s2t.modules.mask import add_optional_chunk_mask
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward2
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling6
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling8
from paddlespeech.s2t.modules.subsampling import DepthwiseConv2DSubsampling4
from paddlespeech.s2t.modules.subsampling import LinearNoSubsampling
from paddlespeech.s2t.modules.subsampling import TimeReductionLayer1D
from paddlespeech.s2t.modules.subsampling import TimeReductionLayer2D
from paddlespeech.s2t.modules.subsampling import TimeReductionLayerStream
from paddlespeech.s2t.utils.log import Log

logger = Log(__name__).getlog()

__all__ = [
    "BaseEncoder", 'TransformerEncoder', "ConformerEncoder",
    "SqueezeformerEncoder"
]


class BaseEncoder(nn.Layer):

@@ -492,37 +505,35 @@ class ConformerEncoder(BaseEncoder):
class SqueezeformerEncoder(nn.Layer):
    def __init__(self,
                 input_size: int,
                 encoder_dim: int=256,
                 output_size: int=256,
                 attention_heads: int=4,
                 num_blocks: int=12,
                 reduce_idx: Optional[Union[int, List[int]]]=5,
                 recover_idx: Optional[Union[int, List[int]]]=11,
                 feed_forward_expansion_factor: int=4,
                 dw_stride: bool=False,
                 input_dropout_rate: float=0.1,
                 pos_enc_layer_type: str="rel_pos",
                 time_reduction_layer_type: str="conv1d",
                 do_rel_shift: bool=True,
                 feed_forward_dropout_rate: float=0.1,
                 attention_dropout_rate: float=0.1,
                 cnn_module_kernel: int=31,
                 cnn_norm_type: str="layer_norm",
                 dropout: float=0.1,
                 causal: bool=False,
                 adaptive_scale: bool=True,
                 activation_type: str="swish",
                 init_weights: bool=True,
                 global_cmvn: paddle.nn.Layer=None,
                 normalize_before: bool=False,
                 use_dynamic_chunk: bool=False,
                 concat_after: bool=False,
                 static_chunk_size: int=0,
                 use_dynamic_left_chunk: bool=False):
        """Construct SqueezeformerEncoder
        Args:

@@ -577,49 +588,40 @@ class SqueezeformerEncoder(nn.Layer):
        # self-attention module definition
        if pos_enc_layer_type != "rel_pos":
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, output_size,
                                           attention_dropout_rate)
        else:
            encoder_selfattn_layer = RelPositionMultiHeadedAttention2
            encoder_selfattn_layer_args = (attention_heads, encoder_dim,
                                           attention_dropout_rate, do_rel_shift,
                                           adaptive_scale, init_weights)

        # feed-forward module definition
        positionwise_layer = PositionwiseFeedForward2
        positionwise_layer_args = (
            encoder_dim, encoder_dim * feed_forward_expansion_factor,
            feed_forward_dropout_rate, activation, adaptive_scale, init_weights)

        # convolution module definition
        convolution_layer = ConvolutionModule2
        convolution_layer_args = (encoder_dim, cnn_module_kernel, activation,
                                  cnn_norm_type, causal, True, adaptive_scale,
                                  init_weights)

        self.embed = DepthwiseConv2DSubsampling4(
            1, encoder_dim,
            RelPositionalEncoding(encoder_dim, dropout_rate=0.1), dw_stride,
            input_size, input_dropout_rate, init_weights)

        self.preln = LayerNorm(encoder_dim)
        self.encoders = paddle.nn.LayerList([
            SqueezeformerEncoderLayer(
                encoder_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                convolution_layer(*convolution_layer_args),
                positionwise_layer(*positionwise_layer_args), normalize_before,
                dropout, concat_after) for _ in range(num_blocks)
        ])

        if time_reduction_layer_type == 'conv1d':
            time_reduction_layer = TimeReductionLayer1D

@@ -637,7 +639,8 @@ class SqueezeformerEncoder(nn.Layer):
            time_reduction_layer = TimeReductionLayer2D
            time_reduction_layer_args = {'encoder_dim': encoder_dim}

        self.time_reduction_layer = time_reduction_layer(
            **time_reduction_layer_args)
        self.time_recover_layer = Linear(encoder_dim, encoder_dim)
        self.final_proj = None
        if output_size != encoder_dim:

@@ -650,8 +653,8 @@ class SqueezeformerEncoder(nn.Layer):
            self,
            xs: paddle.Tensor,
            xs_lens: paddle.Tensor,
            decoding_chunk_size: int=0,
            num_decoding_left_chunks: int=-1,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Embed positions in tensor.
        Args:

@@ -674,12 +677,10 @@ class SqueezeformerEncoder(nn.Layer):
            xs = self.global_cmvn(xs)
        xs, pos_emb, masks = self.embed(xs, masks)
        mask_pad = ~masks
        chunk_masks = add_optional_chunk_mask(
            xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
            decoding_chunk_size, self.static_chunk_size,
            num_decoding_left_chunks)
        xs_lens = chunk_masks.squeeze(1).sum(1)
        xs = self.preln(xs)
        recover_activations: \

@@ -688,15 +689,18 @@ class SqueezeformerEncoder(nn.Layer):
        for i, layer in enumerate(self.encoders):
            if self.reduce_idx is not None:
                if self.time_reduce is not None and i in self.reduce_idx:
                    recover_activations.append(
                        (xs, chunk_masks, pos_emb, mask_pad))
                    xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer(
                        xs, xs_lens, chunk_masks, mask_pad)
                    pos_emb = pos_emb[:, ::2, :]
                    index += 1

            if self.recover_idx is not None:
                if self.time_reduce == 'recover' and i in self.recover_idx:
                    index -= 1
                    recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[
                        index]
                    # recover output length for ctc decode
                    xs = paddle.repeat_interleave(xs, repeats=2, axis=1)
                    xs = self.time_recover_layer(xs)

@@ -732,16 +736,16 @@ class SqueezeformerEncoder(nn.Layer):
        for exp, rc_idx in enumerate(self.recover_idx):
            if i >= rc_idx:
                recover_exp = exp + 1
        return int(2**(reduce_exp - recover_exp))

    def forward_chunk(
            self,
            xs: paddle.Tensor,
            offset: int,
            required_cache_size: int,
            att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
            cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
            att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """ Forward just one chunk

@@ -786,7 +790,8 @@ class SqueezeformerEncoder(nn.Layer):
        elayers, cache_t1 = att_cache.shape[0], att_cache.shape[2]
        chunk_size = xs.shape[1]
        attention_key_size = cache_t1 + chunk_size
        pos_emb = self.embed.position_encoding(
            offset=offset - cache_t1, size=attention_key_size)
        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:

@@ -811,15 +816,18 @@ class SqueezeformerEncoder(nn.Layer):
            #   shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
            if self.reduce_idx is not None:
                if self.time_reduce is not None and i in self.reduce_idx:
                    recover_activations.append(
                        (xs, att_mask, pos_emb, mask_pad))
                    xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer(
                        xs, xs_lens, att_mask, mask_pad)
                    pos_emb = pos_emb[:, ::2, :]
                    index += 1

            if self.recover_idx is not None:
                if self.time_reduce == 'recover' and i in self.recover_idx:
                    index -= 1
                    recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[
                        index]
                    # recover output length for ctc decode
                    xs = paddle.repeat_interleave(xs, repeats=2, axis=1)
                    xs = self.time_recover_layer(xs)

@@ -830,7 +838,9 @@ class SqueezeformerEncoder(nn.Layer):
                    mask_pad = recover_mask_pad

            factor = self.calculate_downsampling_factor(i)
            att_cache1 = att_cache[
                i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[
                    1], :]
            cnn_cache1 = cnn_cache[i] if cnn_cache.shape[0] > 0 else cnn_cache
            xs, _, new_att_cache, new_cnn_cache = layer(
                xs,
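Note: a rough usage sketch of the encoder defined above, assuming it lives at paddlespeech.s2t.modules.encoder and relying on the default constructor arguments shown in the diff; shapes in the comments are illustrative only.

import paddle
from paddlespeech.s2t.modules.encoder import SqueezeformerEncoder

encoder = SqueezeformerEncoder(input_size=80, encoder_dim=256, output_size=256)
feats = paddle.randn([2, 200, 80])                       # (batch, frames, fbank bins)
feats_len = paddle.to_tensor([200, 180], dtype='int64')  # valid frames per utterance
out, masks = encoder(feats, feats_len)                   # time axis subsampled by 4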

paddlespeech/s2t/modules/encoder_layer.py

@@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()

__all__ = [
    "TransformerEncoderLayer", "ConformerEncoderLayer",
    "SqueezeformerEncoderLayer"
]


class TransformerEncoderLayer(nn.Layer):

@@ -281,16 +284,15 @@ class ConformerEncoderLayer(nn.Layer):
class SqueezeformerEncoderLayer(nn.Layer):
    """Encoder layer module."""

    def __init__(self,
                 size: int,
                 self_attn: paddle.nn.Layer,
                 feed_forward1: Optional[nn.Layer]=None,
                 conv_module: Optional[nn.Layer]=None,
                 feed_forward2: Optional[nn.Layer]=None,
                 normalize_before: bool=False,
                 dropout_rate: float=0.1,
                 concat_after: bool=False):
        """Construct an EncoderLayer object.
        Args:

@@ -332,9 +334,9 @@ class SqueezeformerEncoderLayer(nn.Layer):
            x: paddle.Tensor,
            mask: paddle.Tensor,
            pos_emb: paddle.Tensor,
            mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
            att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
            cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Compute encoded features.
        Args:

paddlespeech/s2t/modules/positionwise_feed_forward.py

@@ -16,8 +16,8 @@
"""Positionwise feed forward layer definition."""
import paddle
from paddle import nn
from paddle.nn import initializer as I

from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log

@@ -33,7 +33,7 @@ class PositionwiseFeedForward(nn.Layer):
                 idim: int,
                 hidden_units: int,
                 dropout_rate: float,
                 activation: nn.Layer=nn.ReLU()):
        """Construct a PositionwiseFeedForward object.

        FeedForward are applied on each position of the sequence.

@@ -78,9 +78,9 @@ class PositionwiseFeedForward2(paddle.nn.Layer):
                 idim: int,
                 hidden_units: int,
                 dropout_rate: float,
                 activation: paddle.nn.Layer=paddle.nn.ReLU(),
                 adaptive_scale: bool=False,
                 init_weights: bool=False):
        """Construct a PositionwiseFeedForward object."""
        super(PositionwiseFeedForward2, self).__init__()
        self.idim = idim

@@ -90,21 +90,27 @@ class PositionwiseFeedForward2(paddle.nn.Layer):
        self.dropout = paddle.nn.Dropout(dropout_rate)
        self.w_2 = Linear(hidden_units, idim)
        self.adaptive_scale = adaptive_scale
        ada_scale = self.create_parameter(
            [1, 1, idim], default_initializer=I.XavierUniform())
        self.add_parameter('ada_scale', ada_scale)
        ada_bias = self.create_parameter(
            [1, 1, idim], default_initializer=I.XavierUniform())
        self.add_parameter('ada_bias', ada_bias)

        if init_weights:
            self.init_weights()

    def init_weights(self):
        ffn1_max = self.idim**-0.5
        ffn2_max = self.hidden_units**-0.5
        self.w_1._param_attr = paddle.nn.initializer.Uniform(
            low=-ffn1_max, high=ffn1_max)
        self.w_1._bias_attr = paddle.nn.initializer.Uniform(
            low=-ffn1_max, high=ffn1_max)
        self.w_2._param_attr = paddle.nn.initializer.Uniform(
            low=-ffn2_max, high=ffn2_max)
        self.w_2._bias_attr = paddle.nn.initializer.Uniform(
            low=-ffn2_max, high=ffn2_max)

    def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
        """Forward function.

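Note: a minimal usage sketch of PositionwiseFeedForward2 as the encoder above configures it (hidden size = idim * feed_forward_expansion_factor); the default ReLU activation is used here instead of Swish, so treat it as illustrative only.

import paddle
from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward2

ffn = PositionwiseFeedForward2(
    idim=256, hidden_units=1024, dropout_rate=0.1,
    adaptive_scale=True, init_weights=True)
x = paddle.randn([2, 50, 256])   # (batch, time, idim)
y = ffn(x)                       # same shape as x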
paddlespeech/s2t/modules/subsampling.py

@@ -21,7 +21,8 @@ import paddle.nn.functional as F
from paddle import nn

from paddlespeech.s2t import masked_fill
from paddlespeech.s2t.modules.align import Conv1D
from paddlespeech.s2t.modules.align import Conv2D
from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.conv2d import Conv2DValid

@@ -267,40 +268,46 @@ class DepthwiseConv2DSubsampling4(BaseSubsampling):
    """

    def __init__(self,
                 idim: int,
                 odim: int,
                 pos_enc_class: nn.Layer,
                 dw_stride: bool=False,
                 input_size: int=80,
                 input_dropout_rate: float=0.1,
                 init_weights: bool=True):
        super(DepthwiseConv2DSubsampling4, self).__init__()
        self.idim = idim
        self.odim = odim
        self.pw_conv = Conv2D(
            in_channels=idim, out_channels=odim, kernel_size=3, stride=2)
        self.act1 = nn.ReLU()
        self.dw_conv = Conv2D(
            in_channels=odim,
            out_channels=odim,
            kernel_size=3,
            stride=2,
            groups=odim if dw_stride else 1)
        self.act2 = nn.ReLU()
        self.pos_enc = pos_enc_class
        self.input_proj = nn.Sequential(
            Linear(odim * (((input_size - 1) // 2 - 1) // 2), odim),
            nn.Dropout(p=input_dropout_rate))
        if init_weights:
            linear_max = (odim * input_size / 4)**-0.5
            self.input_proj.state_dict()[
                '0.weight'] = paddle.nn.initializer.Uniform(
                    low=-linear_max, high=linear_max)
            self.input_proj.state_dict()[
                '0.bias'] = paddle.nn.initializer.Uniform(
                    low=-linear_max, high=linear_max)
        self.subsampling_rate = 4
        # 6 = (3 - 1) * 1 + (3 - 1) * 2
        self.right_context = 6

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.pw_conv(x)
        x = self.act1(x)

@@ -327,7 +334,11 @@ class TimeReductionLayer1D(nn.Layer):
        stride (int): Downsampling factor in time dimension.
    """

    def __init__(self,
                 channel: int,
                 out_dim: int,
                 kernel_size: int=5,
                 stride: int=2):
        super(TimeReductionLayer1D, self).__init__()

        self.channel = channel

@@ -342,28 +353,37 @@ class TimeReductionLayer1D(nn.Layer):
            kernel_size=kernel_size,
            stride=stride,
            padding=self.padding,
            groups=channel, )
        self.pw_conv = Conv1D(
            in_channels=channel,
            out_channels=out_dim,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1, )

        self.init_weights()

    def init_weights(self):
        dw_max = self.kernel_size**-0.5
        pw_max = self.channel**-0.5
        self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)

    def forward(
            self,
            xs,
            xs_lens: paddle.Tensor,
            mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
            mask_pad: paddle.Tensor=paddle.ones((0, 0, 0),
                                                dtype=paddle.bool), ):
        xs = xs.transpose([0, 2, 1])  # [B, C, T]
        xs = masked_fill(xs, mask_pad.equal(0), 0.0)

@@ -388,50 +408,60 @@ class TimeReductionLayer1D(nn.Layer):
class TimeReductionLayer2D(nn.Layer):
    def __init__(self, kernel_size: int=5, stride: int=2, encoder_dim: int=256):
        super(TimeReductionLayer2D, self).__init__()
        self.encoder_dim = encoder_dim
        self.kernel_size = kernel_size
        self.dw_conv = Conv2DValid(
            in_channels=encoder_dim,
            out_channels=encoder_dim,
            kernel_size=(kernel_size, 1),
            stride=stride,
            valid_trigy=True)
        self.pw_conv = Conv2DValid(
            in_channels=encoder_dim,
            out_channels=encoder_dim,
            kernel_size=1,
            stride=1,
            valid_trigx=False,
            valid_trigy=False)

        self.kernel_size = kernel_size
        self.stride = stride
        self.init_weights()

    def init_weights(self):
        dw_max = self.kernel_size**-0.5
        pw_max = self.encoder_dim**-0.5
        self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)

    def forward(
            self,
            xs: paddle.Tensor,
            xs_lens: paddle.Tensor,
            mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
            mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        xs = masked_fill(xs, mask_pad.transpose([0, 2, 1]).equal(0), 0.0)
        xs = xs.unsqueeze(1)
        padding1 = self.kernel_size - self.stride
        xs = F.pad(
            xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.)
        xs = self.dw_conv(xs.transpose([0, 3, 2, 1]))
        xs = self.pw_conv(xs).transpose([0, 3, 2, 1]).squeeze(1)
        tmp_length = xs.shape[1]
        xs_lens = (xs_lens + 1) // 2
        padding2 = max(0, (xs_lens.max() - tmp_length).item())
        batch_size, hidden = xs.shape[0], xs.shape[-1]
        dummy_pad = paddle.zeros(
            [batch_size, padding2, hidden], dtype=paddle.float32)
        xs = paddle.concat([xs, dummy_pad], axis=1)
        mask = mask[:, ::2, ::2]
        mask_pad = mask_pad[:, :, ::2]

@@ -451,8 +481,11 @@ class TimeReductionLayerStream(nn.Layer):
        stride (int): Downsampling factor in time dimension.
    """

    def __init__(self,
                 channel: int,
                 out_dim: int,
                 kernel_size: int=1,
                 stride: int=2):
        super(TimeReductionLayerStream, self).__init__()

        self.channel = channel

@@ -460,32 +493,41 @@ class TimeReductionLayerStream(nn.Layer):
        self.kernel_size = kernel_size
        self.stride = stride

        self.dw_conv = Conv1D(
            in_channels=channel,
            out_channels=channel,
            kernel_size=kernel_size,
            stride=stride,
            padding=0,
            groups=channel)

        self.pw_conv = Conv1D(
            in_channels=channel,
            out_channels=out_dim,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1)

        self.init_weights()

    def init_weights(self):
        dw_max = self.kernel_size**-0.5
        pw_max = self.channel**-0.5
        self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)

    def forward(
            self,
            xs,
            xs_lens: paddle.Tensor,
            mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
            mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)):
        xs = xs.transpose([0, 2, 1])  # [B, C, T]
        xs = masked_fill(xs, mask_pad.equal(0), 0.0)
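Note: the Linear layer inside DepthwiseConv2DSubsampling4 sees the frequency axis after two kernel-3, stride-2 convolutions, which is where the odim * (((input_size - 1) // 2 - 1) // 2) term comes from. A quick check of the arithmetic:

def conv_out(n: int) -> int:
    # kernel 3, stride 2, no padding: floor((n - 3) / 2) + 1 == (n - 1) // 2
    return (n - 1) // 2

input_size = 80
print(conv_out(conv_out(input_size)))  # 19, so the projection input width is odim * 19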
