From fe8bbcc226f422ede56199563f000f722e16c7c3 Mon Sep 17 00:00:00 2001
From: yeyupiaoling
Date: Wed, 11 Jan 2023 10:54:45 +0800
Subject: [PATCH] remove rel_shift, test=asr

---
 examples/aishell/asr1/conf/chunk_squeezeformer.yaml | 1 -
 examples/aishell/asr1/conf/squeezeformer.yaml       | 2 +-
 paddlespeech/s2t/modules/attention.py               | 5 +----
 paddlespeech/s2t/modules/encoder.py                 | 5 +----
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/examples/aishell/asr1/conf/chunk_squeezeformer.yaml b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml
index 45a2ac965..35a90b7d6 100644
--- a/examples/aishell/asr1/conf/chunk_squeezeformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml
@@ -21,7 +21,6 @@ encoder_conf:
     normalize_before: false
     activation_type: 'swish'
     pos_enc_layer_type: 'rel_pos'
-    do_rel_shift: false
     time_reduction_layer_type: 'stream'
     causal: true
     use_dynamic_chunk: true
diff --git a/examples/aishell/asr1/conf/squeezeformer.yaml b/examples/aishell/asr1/conf/squeezeformer.yaml
index 49a837a82..b7841aca5 100644
--- a/examples/aishell/asr1/conf/squeezeformer.yaml
+++ b/examples/aishell/asr1/conf/squeezeformer.yaml
@@ -21,7 +21,7 @@ encoder_conf:
     normalize_before: false
     activation_type: 'swish'
     pos_enc_layer_type: 'rel_pos'
-    time_reduction_layer_type: 'conv2d'
+    time_reduction_layer_type: 'conv1d'
 
 # decoder related
 decoder: transformer
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 43700ca1e..14336c03d 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -204,7 +204,6 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
                  n_head,
                  n_feat,
                  dropout_rate,
-                 do_rel_shift=False,
                  adaptive_scale=False,
                  init_weights=False):
         """Construct an RelPositionMultiHeadedAttention object.
@@ -229,7 +228,6 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         pos_bias_v = self.create_parameter(
             (self.h, self.d_k), default_initializer=I.XavierUniform())
         self.add_parameter('pos_bias_v', pos_bias_v)
-        self.do_rel_shift = do_rel_shift
         self.adaptive_scale = adaptive_scale
         if self.adaptive_scale:
             ada_scale = self.create_parameter(
@@ -369,8 +367,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
         matrix_bd = paddle.matmul(q_with_bias_v, p, transpose_y=True)
         # Remove rel_shift since it is useless in speech recognition,
         # and it requires special attention for streaming.
-        if self.do_rel_shift:
-            matrix_bd = self.rel_shift(matrix_bd)
+        # matrix_bd = self.rel_shift(matrix_bd)
 
         scores = (matrix_ac + matrix_bd) / math.sqrt(
             self.d_k)  # (batch, head, time1, time2)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 7be192575..d90d69d77 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -515,7 +515,6 @@ class SqueezeformerEncoder(nn.Layer):
                  input_dropout_rate: float=0.1,
                  pos_enc_layer_type: str="rel_pos",
                  time_reduction_layer_type: str="conv1d",
-                 do_rel_shift: bool=True,
                  feed_forward_dropout_rate: float=0.1,
                  attention_dropout_rate: float=0.1,
                  cnn_module_kernel: int=31,
@@ -549,8 +548,6 @@ class SqueezeformerEncoder(nn.Layer):
             input_dropout_rate (float): Dropout rate of input projection layer.
             pos_enc_layer_type (str): Self attention type.
             time_reduction_layer_type (str): Conv1d or Conv2d reduction layer.
-            do_rel_shift (bool): Whether to do relative shift
-                operation on rel-attention module.
             cnn_module_kernel (int): Kernel size of CNN module.
             activation_type (str): Encoder activation function type.
             cnn_module_kernel (int): Kernel size of convolution module.
@@ -590,7 +587,7 @@ class SqueezeformerEncoder(nn.Layer):
         else:
             encoder_selfattn_layer = RelPositionMultiHeadedAttention
             encoder_selfattn_layer_args = (attention_heads, encoder_dim,
-                                           attention_dropout_rate, do_rel_shift,
+                                           attention_dropout_rate,
                                            adaptive_scale, init_weights)
 
         # feed-forward module definition
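Note (reviewer sketch, not part of the patch): after this change the relative-position attention score is simply the sum of the content term (matrix_ac) and the position term (matrix_bd), with no rel_shift applied in between. Below is a minimal, standalone Python/Paddle sketch of that remaining computation, using toy shapes and random tensors; the names mirror RelPositionMultiHeadedAttention, but the code is illustrative, not the library implementation.

import math

import paddle

# toy shapes (batch, head, time1/time2, d_k) -- illustrative only
batch, head, time1, time2, d_k = 2, 4, 16, 16, 64

q_with_bias_u = paddle.randn([batch, head, time1, d_k])  # query + pos_bias_u
q_with_bias_v = paddle.randn([batch, head, time1, d_k])  # query + pos_bias_v
k = paddle.randn([batch, head, time2, d_k])              # keys
p = paddle.randn([batch, head, time2, d_k])              # projected relative position embeddings

# content-content term and content-position term
matrix_ac = paddle.matmul(q_with_bias_u, k, transpose_y=True)
matrix_bd = paddle.matmul(q_with_bias_v, p, transpose_y=True)

# rel_shift(matrix_bd) is no longer applied here; the two terms are summed directly
scores = (matrix_ac + matrix_bd) / math.sqrt(d_k)  # (batch, head, time1, time2)
print(scores.shape)  # [2, 4, 16, 16]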