From 34acf5f970203627f05f31d5698d1b7c8fe9da8d Mon Sep 17 00:00:00 2001 From: yeyupiaoling Date: Tue, 20 Dec 2022 10:45:58 +0800 Subject: [PATCH] change CodeStyle, test=asr --- paddlespeech/s2t/modules/attention.py | 87 ++++--- paddlespeech/s2t/modules/conv2d.py | 74 +++--- paddlespeech/s2t/modules/convolution.py | 49 ++-- paddlespeech/s2t/modules/encoder.py | 190 ++++++++-------- paddlespeech/s2t/modules/encoder_layer.py | 30 +-- .../s2t/modules/positionwise_feed_forward.py | 32 +-- paddlespeech/s2t/modules/subsampling.py | 212 +++++++++++------- 7 files changed, 389 insertions(+), 285 deletions(-) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 29d26c60c..6347bdb12 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() -__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention", "RelPositionMultiHeadedAttention2"] +__all__ = [ + "MultiHeadedAttention", "RelPositionMultiHeadedAttention", + "RelPositionMultiHeadedAttention2" +] # Relative Positional Encodings # https://www.jianshu.com/p/c0608efcc26f @@ -341,7 +344,13 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention): dropout_rate (float): Dropout rate. """ - def __init__(self, n_head, n_feat, dropout_rate, do_rel_shift=False, adaptive_scale=False, init_weights=False): + def __init__(self, + n_head, + n_feat, + dropout_rate, + do_rel_shift=False, + adaptive_scale=False, + init_weights=False): """Construct an RelPositionMultiHeadedAttention object.""" super().__init__(n_head, n_feat, dropout_rate) # linear transformation for positional encoding @@ -349,32 +358,46 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention): # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 self.do_rel_shift = do_rel_shift - pos_bias_u = self.create_parameter([self.h, self.d_k], default_initializer=I.XavierUniform()) + pos_bias_u = self.create_parameter( + [self.h, self.d_k], default_initializer=I.XavierUniform()) self.add_parameter('pos_bias_u', pos_bias_u) - pos_bias_v = self.create_parameter([self.h, self.d_k], default_initializer=I.XavierUniform()) + pos_bias_v = self.create_parameter( + [self.h, self.d_k], default_initializer=I.XavierUniform()) self.add_parameter('pos_bias_v', pos_bias_v) self.adaptive_scale = adaptive_scale - ada_scale = self.create_parameter([1, 1, n_feat], default_initializer=I.Constant(1.0)) + ada_scale = self.create_parameter( + [1, 1, n_feat], default_initializer=I.Constant(1.0)) self.add_parameter('ada_scale', ada_scale) - ada_bias = self.create_parameter([1, 1, n_feat], default_initializer=I.Constant(0.0)) + ada_bias = self.create_parameter( + [1, 1, n_feat], default_initializer=I.Constant(0.0)) self.add_parameter('ada_bias', ada_bias) if init_weights: self.init_weights() def init_weights(self): - input_max = (self.h * self.d_k) ** -0.5 - self.linear_q._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_q._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_k._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_k._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_v._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_v._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_pos._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_pos._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_out._param_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - self.linear_out._bias_attr = paddle.nn.initializer.Uniform(low=-input_max, high=input_max) - - def rel_shift(self, x, zero_triu: bool = False): + input_max = (self.h * self.d_k)**-0.5 + self.linear_q._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_q._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_k._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_k._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_v._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_v._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_pos._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_pos._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_out._param_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + self.linear_out._bias_attr = paddle.nn.initializer.Uniform( + low=-input_max, high=input_max) + + def rel_shift(self, x, zero_triu: bool=False): """Compute relative positinal encoding. Args: x (paddle.Tensor): Input tensor (batch, head, time1, time1). @@ -383,10 +406,12 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention): Returns: paddle.Tensor: Output tensor. (batch, head, time1, time1) """ - zero_pad = paddle.zeros([x.shape[0], x.shape[1], x.shape[2], 1], dtype=x.dtype) + zero_pad = paddle.zeros( + [x.shape[0], x.shape[1], x.shape[2], 1], dtype=x.dtype) x_padded = paddle.concat([zero_pad, x], axis=-1) - x_padded = x_padded.reshape([x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2]]) + x_padded = x_padded.reshape( + [x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2]]) x = x_padded[:, :, 1:].reshape(paddle.shape(x)) # [B, H, T1, T1] if zero_triu: @@ -395,12 +420,14 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention): return x - def forward(self, query: paddle.Tensor, - key: paddle.Tensor, value: paddle.Tensor, - mask: paddle.Tensor = paddle.ones((0, 0, 0), dtype=paddle.bool), - pos_emb: paddle.Tensor = paddle.empty([0]), - cache: paddle.Tensor = paddle.zeros((0, 0, 0, 0)) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), + pos_emb: paddle.Tensor=paddle.empty([0]), + cache: paddle.Tensor=paddle.zeros( + (0, 0, 0, 0))) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). @@ -434,7 +461,8 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention): new_cache = paddle.concat((k, v), axis=-1) n_batch_pos = pos_emb.shape[0] - p = self.linear_pos(pos_emb).reshape([n_batch_pos, -1, self.h, self.d_k]) + p = self.linear_pos(pos_emb).reshape( + [n_batch_pos, -1, self.h, self.d_k]) p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) # (batch, head, time1, d_k) @@ -460,6 +488,7 @@ class RelPositionMultiHeadedAttention2(MultiHeadedAttention): if self.do_rel_shift: matrix_bd = self.rel_shift(matrix_bd) - scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) + scores = (matrix_ac + matrix_bd) / math.sqrt( + self.d_k) # (batch, head, time1, time2) return self.forward_attention(v, scores, mask), new_cache diff --git a/paddlespeech/s2t/modules/conv2d.py b/paddlespeech/s2t/modules/conv2d.py index 4b41d80a4..ca6e136ad 100644 --- a/paddlespeech/s2t/modules/conv2d.py +++ b/paddlespeech/s2t/modules/conv2d.py @@ -1,4 +1,5 @@ -from typing import Union, Optional +from typing import Optional +from typing import Union import paddle import paddle.nn.functional as F @@ -12,45 +13,50 @@ class Conv2DValid(_ConvNd): Conv2d operator for VALID mode padding. """ - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int, - stride: int = 1, - padding: Union[str, int] = 0, - dilation: int = 1, - groups: int = 1, - padding_mode: str = 'zeros', - weight_attr=None, - bias_attr=None, - data_format="NCHW", - valid_trigx: bool = False, - valid_trigy: bool = False - ) -> None: - super(Conv2DValid, self).__init__(in_channels, - out_channels, - kernel_size, - False, - 2, - stride=stride, - padding=padding, - padding_mode=padding_mode, - dilation=dilation, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int=1, + padding: Union[str, int]=0, + dilation: int=1, + groups: int=1, + padding_mode: str='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW", + valid_trigx: bool=False, + valid_trigy: bool=False) -> None: + super(Conv2DValid, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) self.valid_trigx = valid_trigx self.valid_trigy = valid_trigy - def _conv_forward(self, input: paddle.Tensor, weight: paddle.Tensor, bias: Optional[paddle.Tensor]): + def _conv_forward(self, + input: paddle.Tensor, + weight: paddle.Tensor, + bias: Optional[paddle.Tensor]): validx, validy = 0, 0 if self.valid_trigx: - validx = (input.shape[-2] * (self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2 + validx = (input.shape[-2] * + (self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2 if self.valid_trigy: - validy = (input.shape[-1] * (self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2 - return F.conv2d(input, weight, bias, self._stride, (validx, validy), self._dilation, self._groups) + validy = (input.shape[-1] * + (self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2 + return F.conv2d(input, weight, bias, self._stride, (validx, validy), + self._dilation, self._groups) def forward(self, input: paddle.Tensor) -> paddle.Tensor: return self._conv_forward(input, self.weight, self.bias) diff --git a/paddlespeech/s2t/modules/convolution.py b/paddlespeech/s2t/modules/convolution.py index 470186765..caaa98566 100644 --- a/paddlespeech/s2t/modules/convolution.py +++ b/paddlespeech/s2t/modules/convolution.py @@ -16,13 +16,13 @@ class ConvolutionModule2(nn.Layer): def __init__(self, channels: int, - kernel_size: int = 15, - activation: nn.Layer = nn.ReLU(), - norm: str = "batch_norm", - causal: bool = False, - bias: bool = True, - adaptive_scale: bool = False, - init_weights: bool = False): + kernel_size: int=15, + activation: nn.Layer=nn.ReLU(), + norm: str="batch_norm", + causal: bool=False, + bias: bool=True, + adaptive_scale: bool=False, + init_weights: bool=False): """Construct an ConvolutionModule object. Args: channels (int): The number of channels of conv layers. @@ -35,9 +35,11 @@ class ConvolutionModule2(nn.Layer): self.channels = channels self.kernel_size = kernel_size self.adaptive_scale = adaptive_scale - ada_scale = self.create_parameter([1, 1, channels], default_initializer=I.Constant(1.0)) + ada_scale = self.create_parameter( + [1, 1, channels], default_initializer=I.Constant(1.0)) self.add_parameter('ada_scale', ada_scale) - ada_bias = self.create_parameter([1, 1, channels], default_initializer=I.Constant(0.0)) + ada_bias = self.create_parameter( + [1, 1, channels], default_initializer=I.Constant(0.0)) self.add_parameter('ada_bias', ada_bias) self.pointwise_conv1 = Conv1D( @@ -96,23 +98,29 @@ class ConvolutionModule2(nn.Layer): self.init_weights() def init_weights(self): - pw_max = self.channels ** -0.5 - dw_max = self.kernel_size ** -0.5 - self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) + pw_max = self.channels**-0.5 + dw_max = self.kernel_size**-0.5 + self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) if self.bias: - self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) - self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) + self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) if self.bias: - self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) + self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) if self.bias: - self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) + self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) def forward( self, x: paddle.Tensor, - mask_pad: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), - cache: paddle.Tensor = paddle.zeros([0, 0, 0]), + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + cache: paddle.Tensor=paddle.zeros([0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: @@ -137,7 +145,8 @@ class ConvolutionModule2(nn.Layer): if self.lorder > 0: if cache.shape[2] == 0: # cache_t == 0 - x = nn.functional.pad(x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') + x = nn.functional.pad( + x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: assert cache.shape[0] == x.shape[0] # B assert cache.shape[1] == x.shape[1] # C diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 6063e95dc..f19ecfe41 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -14,36 +14,49 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) """Encoder definition.""" -from typing import Tuple, Union, Optional, List +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union import paddle from paddle import nn from typeguard import check_argument_types from paddlespeech.s2t.modules.activation import get_activation -from paddlespeech.s2t.modules.align import LayerNorm, Linear -from paddlespeech.s2t.modules.attention import MultiHeadedAttention, RelPositionMultiHeadedAttention2 +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear +from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention +from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention2 from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule from paddlespeech.s2t.modules.convolution import ConvolutionModule2 from paddlespeech.s2t.modules.embedding import NoPositionalEncoding from paddlespeech.s2t.modules.embedding import PositionalEncoding from paddlespeech.s2t.modules.embedding import RelPositionalEncoding -from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer, SqueezeformerEncoderLayer +from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer +from paddlespeech.s2t.modules.encoder_layer import SqueezeformerEncoderLayer from paddlespeech.s2t.modules.encoder_layer import TransformerEncoderLayer from paddlespeech.s2t.modules.mask import add_optional_chunk_mask from paddlespeech.s2t.modules.mask import make_non_pad_mask -from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward, PositionwiseFeedForward2 -from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4, TimeReductionLayerStream, TimeReductionLayer1D, \ - DepthwiseConv2DSubsampling4, TimeReductionLayer2D +from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward2 +from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4 from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling6 from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling8 +from paddlespeech.s2t.modules.subsampling import DepthwiseConv2DSubsampling4 from paddlespeech.s2t.modules.subsampling import LinearNoSubsampling +from paddlespeech.s2t.modules.subsampling import TimeReductionLayer1D +from paddlespeech.s2t.modules.subsampling import TimeReductionLayer2D +from paddlespeech.s2t.modules.subsampling import TimeReductionLayerStream from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() -__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder", "SqueezeformerEncoder"] +__all__ = [ + "BaseEncoder", 'TransformerEncoder', "ConformerEncoder", + "SqueezeformerEncoder" +] class BaseEncoder(nn.Layer): @@ -492,37 +505,35 @@ class ConformerEncoder(BaseEncoder): class SqueezeformerEncoder(nn.Layer): - def __init__( - self, - input_size: int, - encoder_dim: int = 256, - output_size: int = 256, - attention_heads: int = 4, - num_blocks: int = 12, - reduce_idx: Optional[Union[int, List[int]]] = 5, - recover_idx: Optional[Union[int, List[int]]] = 11, - feed_forward_expansion_factor: int = 4, - dw_stride: bool = False, - input_dropout_rate: float = 0.1, - pos_enc_layer_type: str = "rel_pos", - time_reduction_layer_type: str = "conv1d", - do_rel_shift: bool = True, - feed_forward_dropout_rate: float = 0.1, - attention_dropout_rate: float = 0.1, - cnn_module_kernel: int = 31, - cnn_norm_type: str = "layer_norm", - dropout: float = 0.1, - causal: bool = False, - adaptive_scale: bool = True, - activation_type: str = "swish", - init_weights: bool = True, - global_cmvn: paddle.nn.Layer = None, - normalize_before: bool = False, - use_dynamic_chunk: bool = False, - concat_after: bool = False, - static_chunk_size: int = 0, - use_dynamic_left_chunk: bool = False - ): + def __init__(self, + input_size: int, + encoder_dim: int=256, + output_size: int=256, + attention_heads: int=4, + num_blocks: int=12, + reduce_idx: Optional[Union[int, List[int]]]=5, + recover_idx: Optional[Union[int, List[int]]]=11, + feed_forward_expansion_factor: int=4, + dw_stride: bool=False, + input_dropout_rate: float=0.1, + pos_enc_layer_type: str="rel_pos", + time_reduction_layer_type: str="conv1d", + do_rel_shift: bool=True, + feed_forward_dropout_rate: float=0.1, + attention_dropout_rate: float=0.1, + cnn_module_kernel: int=31, + cnn_norm_type: str="layer_norm", + dropout: float=0.1, + causal: bool=False, + adaptive_scale: bool=True, + activation_type: str="swish", + init_weights: bool=True, + global_cmvn: paddle.nn.Layer=None, + normalize_before: bool=False, + use_dynamic_chunk: bool=False, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_left_chunk: bool=False): """Construct SqueezeformerEncoder Args: @@ -577,49 +588,40 @@ class SqueezeformerEncoder(nn.Layer): # self-attention module definition if pos_enc_layer_type != "rel_pos": encoder_selfattn_layer = MultiHeadedAttention - encoder_selfattn_layer_args = (attention_heads, - output_size, + encoder_selfattn_layer_args = (attention_heads, output_size, attention_dropout_rate) else: encoder_selfattn_layer = RelPositionMultiHeadedAttention2 - encoder_selfattn_layer_args = (attention_heads, - encoder_dim, - attention_dropout_rate, - do_rel_shift, - adaptive_scale, - init_weights) + encoder_selfattn_layer_args = (attention_heads, encoder_dim, + attention_dropout_rate, do_rel_shift, + adaptive_scale, init_weights) # feed-forward module definition positionwise_layer = PositionwiseFeedForward2 - positionwise_layer_args = (encoder_dim, - encoder_dim * feed_forward_expansion_factor, - feed_forward_dropout_rate, - activation, - adaptive_scale, - init_weights) + positionwise_layer_args = ( + encoder_dim, encoder_dim * feed_forward_expansion_factor, + feed_forward_dropout_rate, activation, adaptive_scale, init_weights) # convolution module definition convolution_layer = ConvolutionModule2 convolution_layer_args = (encoder_dim, cnn_module_kernel, activation, - cnn_norm_type, causal, True, adaptive_scale, init_weights) + cnn_norm_type, causal, True, adaptive_scale, + init_weights) - self.embed = DepthwiseConv2DSubsampling4(1, encoder_dim, - RelPositionalEncoding(encoder_dim, dropout_rate=0.1), - dw_stride, - input_size, - input_dropout_rate, - init_weights) + self.embed = DepthwiseConv2DSubsampling4( + 1, encoder_dim, + RelPositionalEncoding(encoder_dim, dropout_rate=0.1), dw_stride, + input_size, input_dropout_rate, init_weights) self.preln = LayerNorm(encoder_dim) - self.encoders = paddle.nn.LayerList([SqueezeformerEncoderLayer( - encoder_dim, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - convolution_layer(*convolution_layer_args), - positionwise_layer(*positionwise_layer_args), - normalize_before, - dropout, - concat_after) for _ in range(num_blocks) + self.encoders = paddle.nn.LayerList([ + SqueezeformerEncoderLayer( + encoder_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + convolution_layer(*convolution_layer_args), + positionwise_layer(*positionwise_layer_args), normalize_before, + dropout, concat_after) for _ in range(num_blocks) ]) if time_reduction_layer_type == 'conv1d': time_reduction_layer = TimeReductionLayer1D @@ -637,7 +639,8 @@ class SqueezeformerEncoder(nn.Layer): time_reduction_layer = TimeReductionLayer2D time_reduction_layer_args = {'encoder_dim': encoder_dim} - self.time_reduction_layer = time_reduction_layer(**time_reduction_layer_args) + self.time_reduction_layer = time_reduction_layer( + **time_reduction_layer_args) self.time_recover_layer = Linear(encoder_dim, encoder_dim) self.final_proj = None if output_size != encoder_dim: @@ -650,8 +653,8 @@ class SqueezeformerEncoder(nn.Layer): self, xs: paddle.Tensor, xs_lens: paddle.Tensor, - decoding_chunk_size: int = 0, - num_decoding_left_chunks: int = -1, + decoding_chunk_size: int=0, + num_decoding_left_chunks: int=-1, ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Embed positions in tensor. Args: @@ -674,12 +677,10 @@ class SqueezeformerEncoder(nn.Layer): xs = self.global_cmvn(xs) xs, pos_emb, masks = self.embed(xs, masks) mask_pad = ~masks - chunk_masks = add_optional_chunk_mask(xs, masks, - self.use_dynamic_chunk, - self.use_dynamic_left_chunk, - decoding_chunk_size, - self.static_chunk_size, - num_decoding_left_chunks) + chunk_masks = add_optional_chunk_mask( + xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, + decoding_chunk_size, self.static_chunk_size, + num_decoding_left_chunks) xs_lens = chunk_masks.squeeze(1).sum(1) xs = self.preln(xs) recover_activations: \ @@ -688,15 +689,18 @@ class SqueezeformerEncoder(nn.Layer): for i, layer in enumerate(self.encoders): if self.reduce_idx is not None: if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, chunk_masks, pos_emb, mask_pad)) - xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer(xs, xs_lens, chunk_masks, mask_pad) + recover_activations.append( + (xs, chunk_masks, pos_emb, mask_pad)) + xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer( + xs, xs_lens, chunk_masks, mask_pad) pos_emb = pos_emb[:, ::2, :] index += 1 if self.recover_idx is not None: if self.time_reduce == 'recover' and i in self.recover_idx: index -= 1 - recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[index] + recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[ + index] # recover output length for ctc decode xs = paddle.repeat_interleave(xs, repeats=2, axis=1) xs = self.time_recover_layer(xs) @@ -732,16 +736,16 @@ class SqueezeformerEncoder(nn.Layer): for exp, rc_idx in enumerate(self.recover_idx): if i >= rc_idx: recover_exp = exp + 1 - return int(2 ** (reduce_exp - recover_exp)) + return int(2**(reduce_exp - recover_exp)) def forward_chunk( self, xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), - att_mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk @@ -786,7 +790,8 @@ class SqueezeformerEncoder(nn.Layer): elayers, cache_t1 = att_cache.shape[0], att_cache.shape[2] chunk_size = xs.shape[1] attention_key_size = cache_t1 + chunk_size - pos_emb = self.embed.position_encoding(offset=offset - cache_t1, size=attention_key_size) + pos_emb = self.embed.position_encoding( + offset=offset - cache_t1, size=attention_key_size) if required_cache_size < 0: next_cache_start = 0 elif required_cache_size == 0: @@ -811,15 +816,18 @@ class SqueezeformerEncoder(nn.Layer): # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2) if self.reduce_idx is not None: if self.time_reduce is not None and i in self.reduce_idx: - recover_activations.append((xs, att_mask, pos_emb, mask_pad)) - xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer(xs, xs_lens, att_mask, mask_pad) + recover_activations.append( + (xs, att_mask, pos_emb, mask_pad)) + xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer( + xs, xs_lens, att_mask, mask_pad) pos_emb = pos_emb[:, ::2, :] index += 1 if self.recover_idx is not None: if self.time_reduce == 'recover' and i in self.recover_idx: index -= 1 - recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[index] + recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[ + index] # recover output length for ctc decode xs = paddle.repeat_interleave(xs, repeats=2, axis=1) xs = self.time_recover_layer(xs) @@ -830,7 +838,9 @@ class SqueezeformerEncoder(nn.Layer): mask_pad = recover_mask_pad factor = self.calculate_downsampling_factor(i) - att_cache1 = att_cache[i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[1], :] + att_cache1 = att_cache[ + i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[ + 1], :] cnn_cache1 = cnn_cache[i] if cnn_cache.shape[0] > 0 else cnn_cache xs, _, new_att_cache, new_cnn_cache = layer( xs, diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 08304210a..ecba95e85 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() -__all__ = ["TransformerEncoderLayer", "ConformerEncoderLayer", "SqueezeformerEncoderLayer"] +__all__ = [ + "TransformerEncoderLayer", "ConformerEncoderLayer", + "SqueezeformerEncoderLayer" +] class TransformerEncoderLayer(nn.Layer): @@ -281,16 +284,15 @@ class ConformerEncoderLayer(nn.Layer): class SqueezeformerEncoderLayer(nn.Layer): """Encoder layer module.""" - def __init__( - self, - size: int, - self_attn: paddle.nn.Layer, - feed_forward1: Optional[nn.Layer] = None, - conv_module: Optional[nn.Layer] = None, - feed_forward2: Optional[nn.Layer] = None, - normalize_before: bool = False, - dropout_rate: float = 0.1, - concat_after: bool = False): + def __init__(self, + size: int, + self_attn: paddle.nn.Layer, + feed_forward1: Optional[nn.Layer]=None, + conv_module: Optional[nn.Layer]=None, + feed_forward2: Optional[nn.Layer]=None, + normalize_before: bool=False, + dropout_rate: float=0.1, + concat_after: bool=False): """Construct an EncoderLayer object. Args: @@ -332,9 +334,9 @@ class SqueezeformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index 28488d06f..39d8b1893 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -16,8 +16,8 @@ """Positionwise feed forward layer definition.""" import paddle from paddle import nn - from paddle.nn import initializer as I + from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log @@ -33,7 +33,7 @@ class PositionwiseFeedForward(nn.Layer): idim: int, hidden_units: int, dropout_rate: float, - activation: nn.Layer = nn.ReLU()): + activation: nn.Layer=nn.ReLU()): """Construct a PositionwiseFeedForward object. FeedForward are appied on each position of the sequence. @@ -78,9 +78,9 @@ class PositionwiseFeedForward2(paddle.nn.Layer): idim: int, hidden_units: int, dropout_rate: float, - activation: paddle.nn.Layer = paddle.nn.ReLU(), - adaptive_scale: bool = False, - init_weights: bool = False): + activation: paddle.nn.Layer=paddle.nn.ReLU(), + adaptive_scale: bool=False, + init_weights: bool=False): """Construct a PositionwiseFeedForward object.""" super(PositionwiseFeedForward2, self).__init__() self.idim = idim @@ -90,21 +90,27 @@ class PositionwiseFeedForward2(paddle.nn.Layer): self.dropout = paddle.nn.Dropout(dropout_rate) self.w_2 = Linear(hidden_units, idim) self.adaptive_scale = adaptive_scale - ada_scale = self.create_parameter([1, 1, idim], default_initializer=I.XavierUniform()) + ada_scale = self.create_parameter( + [1, 1, idim], default_initializer=I.XavierUniform()) self.add_parameter('ada_scale', ada_scale) - ada_bias = self.create_parameter([1, 1, idim], default_initializer=I.XavierUniform()) + ada_bias = self.create_parameter( + [1, 1, idim], default_initializer=I.XavierUniform()) self.add_parameter('ada_bias', ada_bias) if init_weights: self.init_weights() def init_weights(self): - ffn1_max = self.idim ** -0.5 - ffn2_max = self.hidden_units ** -0.5 - self.w_1._param_attr = paddle.nn.initializer.Uniform(low=-ffn1_max, high=ffn1_max) - self.w_1._bias_attr = paddle.nn.initializer.Uniform(low=-ffn1_max, high=ffn1_max) - self.w_2._param_attr = paddle.nn.initializer.Uniform(low=-ffn2_max, high=ffn2_max) - self.w_2._bias_attr = paddle.nn.initializer.Uniform(low=-ffn2_max, high=ffn2_max) + ffn1_max = self.idim**-0.5 + ffn2_max = self.hidden_units**-0.5 + self.w_1._param_attr = paddle.nn.initializer.Uniform( + low=-ffn1_max, high=ffn1_max) + self.w_1._bias_attr = paddle.nn.initializer.Uniform( + low=-ffn1_max, high=ffn1_max) + self.w_2._param_attr = paddle.nn.initializer.Uniform( + low=-ffn2_max, high=ffn2_max) + self.w_2._bias_attr = paddle.nn.initializer.Uniform( + low=-ffn2_max, high=ffn2_max) def forward(self, xs: paddle.Tensor) -> paddle.Tensor: """Forward function. diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 97c226150..51322d324 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -21,7 +21,8 @@ import paddle.nn.functional as F from paddle import nn from paddlespeech.s2t import masked_fill -from paddlespeech.s2t.modules.align import Conv2D, Conv1D +from paddlespeech.s2t.modules.align import Conv1D +from paddlespeech.s2t.modules.align import Conv2D from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.conv2d import Conv2DValid @@ -267,40 +268,46 @@ class DepthwiseConv2DSubsampling4(BaseSubsampling): """ - def __init__( - self, idim: int, odim: int, - pos_enc_class: nn.Layer, - dw_stride: bool = False, - input_size: int = 80, - input_dropout_rate: float = 0.1, - init_weights: bool = True): + def __init__(self, + idim: int, + odim: int, + pos_enc_class: nn.Layer, + dw_stride: bool=False, + input_size: int=80, + input_dropout_rate: float=0.1, + init_weights: bool=True): super(DepthwiseConv2DSubsampling4, self).__init__() self.idim = idim self.odim = odim - self.pw_conv = Conv2D(in_channels=idim, out_channels=odim, kernel_size=3, stride=2) + self.pw_conv = Conv2D( + in_channels=idim, out_channels=odim, kernel_size=3, stride=2) self.act1 = nn.ReLU() - self.dw_conv = Conv2D(in_channels=odim, out_channels=odim, kernel_size=3, stride=2, - groups=odim if dw_stride else 1) + self.dw_conv = Conv2D( + in_channels=odim, + out_channels=odim, + kernel_size=3, + stride=2, + groups=odim if dw_stride else 1) self.act2 = nn.ReLU() self.pos_enc = pos_enc_class self.input_proj = nn.Sequential( Linear(odim * (((input_size - 1) // 2 - 1) // 2), odim), nn.Dropout(p=input_dropout_rate)) if init_weights: - linear_max = (odim * input_size / 4) ** -0.5 - self.input_proj.state_dict()['0.weight'] = paddle.nn.initializer.Uniform(low=-linear_max, high=linear_max) - self.input_proj.state_dict()['0.bias'] = paddle.nn.initializer.Uniform(low=-linear_max, high=linear_max) + linear_max = (odim * input_size / 4)**-0.5 + self.input_proj.state_dict()[ + '0.weight'] = paddle.nn.initializer.Uniform( + low=-linear_max, high=linear_max) + self.input_proj.state_dict()[ + '0.bias'] = paddle.nn.initializer.Uniform( + low=-linear_max, high=linear_max) self.subsampling_rate = 4 # 6 = (3 - 1) * 1 + (3 - 1) * 2 self.right_context = 6 - def forward( - self, - x: paddle.Tensor, - x_mask: paddle.Tensor, - offset: int = 0 - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0 + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: x = x.unsqueeze(1) # (b, c=1, t, f) x = self.pw_conv(x) x = self.act1(x) @@ -327,7 +334,11 @@ class TimeReductionLayer1D(nn.Layer): stride (int): Downsampling factor in time dimension. """ - def __init__(self, channel: int, out_dim: int, kernel_size: int = 5, stride: int = 2): + def __init__(self, + channel: int, + out_dim: int, + kernel_size: int=5, + stride: int=2): super(TimeReductionLayer1D, self).__init__() self.channel = channel @@ -342,28 +353,37 @@ class TimeReductionLayer1D(nn.Layer): kernel_size=kernel_size, stride=stride, padding=self.padding, - groups=channel, - ) + groups=channel, ) self.pw_conv = Conv1D( - in_channels=channel, out_channels=out_dim, - kernel_size=1, stride=1, padding=0, groups=1, - ) + in_channels=channel, + out_channels=out_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, ) self.init_weights() def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - self.dw_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.pw_conv._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) - self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) - - def forward(self, xs, xs_lens: paddle.Tensor, - mask: paddle.Tensor = paddle.ones((0, 0, 0), dtype=paddle.bool), - mask_pad: paddle.Tensor = paddle.ones((0, 0, 0), dtype=paddle.bool), - ): + dw_max = self.kernel_size**-0.5 + pw_max = self.channel**-0.5 + self.dw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.dw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.pw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + + def forward( + self, + xs, + xs_lens: paddle.Tensor, + mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), + mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), + dtype=paddle.bool), ): xs = xs.transpose([0, 2, 1]) # [B, C, T] xs = masked_fill(xs, mask_pad.equal(0), 0.0) @@ -388,50 +408,60 @@ class TimeReductionLayer1D(nn.Layer): class TimeReductionLayer2D(nn.Layer): - def __init__(self, kernel_size: int = 5, stride: int = 2, encoder_dim: int = 256): + def __init__(self, kernel_size: int=5, stride: int=2, encoder_dim: int=256): super(TimeReductionLayer2D, self).__init__() self.encoder_dim = encoder_dim self.kernel_size = kernel_size - self.dw_conv = Conv2DValid(in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=(kernel_size, 1), - stride=stride, - valid_trigy=True) - self.pw_conv = Conv2DValid(in_channels=encoder_dim, - out_channels=encoder_dim, - kernel_size=1, - stride=1, - valid_trigx=False, - valid_trigy=False) + self.dw_conv = Conv2DValid( + in_channels=encoder_dim, + out_channels=encoder_dim, + kernel_size=(kernel_size, 1), + stride=stride, + valid_trigy=True) + self.pw_conv = Conv2DValid( + in_channels=encoder_dim, + out_channels=encoder_dim, + kernel_size=1, + stride=1, + valid_trigx=False, + valid_trigy=False) self.kernel_size = kernel_size self.stride = stride self.init_weights() def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.encoder_dim ** -0.5 - self.dw_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.pw_conv._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) - self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) + dw_max = self.kernel_size**-0.5 + pw_max = self.encoder_dim**-0.5 + self.dw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.dw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.pw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) def forward( - self, xs: paddle.Tensor, xs_lens: paddle.Tensor, - mask: paddle.Tensor = paddle.ones((0, 0, 0), dtype=paddle.bool), - mask_pad: paddle.Tensor = paddle.ones((0, 0, 0), dtype=paddle.bool), + self, + xs: paddle.Tensor, + xs_lens: paddle.Tensor, + mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), + mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: xs = masked_fill(xs, mask_pad.transpose([0, 2, 1]).equal(0), 0.0) xs = xs.unsqueeze(1) padding1 = self.kernel_size - self.stride - xs = F.pad(xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.) + xs = F.pad( + xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.) xs = self.dw_conv(xs.transpose([0, 3, 2, 1])) xs = self.pw_conv(xs).transpose([0, 3, 2, 1]).squeeze(1) tmp_length = xs.shape[1] xs_lens = (xs_lens + 1) // 2 padding2 = max(0, (xs_lens.max() - tmp_length).item()) batch_size, hidden = xs.shape[0], xs.shape[-1] - dummy_pad = paddle.zeros([batch_size, padding2, hidden], dtype=paddle.float32) + dummy_pad = paddle.zeros( + [batch_size, padding2, hidden], dtype=paddle.float32) xs = paddle.concat([xs, dummy_pad], axis=1) mask = mask[:, ::2, ::2] mask_pad = mask_pad[:, :, ::2] @@ -451,8 +481,11 @@ class TimeReductionLayerStream(nn.Layer): stride (int): Downsampling factor in time dimension. """ - def __init__(self, channel: int, out_dim: int, - kernel_size: int = 1, stride: int = 2): + def __init__(self, + channel: int, + out_dim: int, + kernel_size: int=1, + stride: int=2): super(TimeReductionLayerStream, self).__init__() self.channel = channel @@ -460,32 +493,41 @@ class TimeReductionLayerStream(nn.Layer): self.kernel_size = kernel_size self.stride = stride - self.dw_conv = Conv1D(in_channels=channel, - out_channels=channel, - kernel_size=kernel_size, - stride=stride, - padding=0, - groups=channel) - - self.pw_conv = Conv1D(in_channels=channel, - out_channels=out_dim, - kernel_size=1, - stride=1, - padding=0, - groups=1) + self.dw_conv = Conv1D( + in_channels=channel, + out_channels=channel, + kernel_size=kernel_size, + stride=stride, + padding=0, + groups=channel) + + self.pw_conv = Conv1D( + in_channels=channel, + out_channels=out_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1) self.init_weights() def init_weights(self): - dw_max = self.kernel_size ** -0.5 - pw_max = self.channel ** -0.5 - self.dw_conv._param_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-dw_max, high=dw_max) - self.pw_conv._param_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) - self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(low=-pw_max, high=pw_max) - - def forward(self, xs, xs_lens: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), - mask_pad: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool)): + dw_max = self.kernel_size**-0.5 + pw_max = self.channel**-0.5 + self.dw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.dw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-dw_max, high=dw_max) + self.pw_conv._param_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + self.pw_conv._bias_attr = paddle.nn.initializer.Uniform( + low=-pw_max, high=pw_max) + + def forward( + self, + xs, + xs_lens: paddle.Tensor, + mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)): xs = xs.transpose([0, 2, 1]) # [B, C, T] xs = masked_fill(xs, mask_pad.equal(0), 0.0)