From e6b23ae0c5de9c0c8fa2be26eca1489cf96413a5 Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Mon, 29 Aug 2022 10:15:27 +0000 Subject: [PATCH] fixed multi-gpu training --- paddlespeech/s2t/models/u2/u2.py | 4 ++-- paddlespeech/s2t/modules/attention.py | 14 +++++++------- paddlespeech/s2t/modules/conformer_convolution.py | 4 ++-- paddlespeech/s2t/modules/decoder_layer.py | 12 ++++++++---- paddlespeech/s2t/modules/encoder.py | 9 +++++---- paddlespeech/s2t/modules/encoder_layer.py | 14 +++++++------- 6 files changed, 31 insertions(+), 26 deletions(-) diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index e19f411cf..a812abcbd 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index b6d615867..8c4eb9c3c 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -86,7 +86,7 @@ class MultiHeadedAttention(nn.Layer): def forward_attention(self, value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), + mask: paddle.Tensor, ) -> paddle.Tensor: """Compute attention context vector. Args: @@ -131,9 +131,9 @@ class MultiHeadedAttention(nn.Layer): query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), - pos_emb: paddle.Tensor = paddle.empty([0]), - cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + mask: paddle.Tensor, + pos_emb: paddle.Tensor, + cache: paddle.Tensor ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: @@ -247,9 +247,9 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor, - mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), - pos_emb: paddle.Tensor = paddle.empty([0]), - cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + mask: paddle.Tensor, + pos_emb: paddle.Tensor, + cache: paddle.Tensor ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index c384b9c78..b34f9ee1d 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -108,8 +108,8 @@ class ConvolutionModule(nn.Layer): def forward(self, x: paddle.Tensor, - mask_pad: paddle.Tensor= paddle.ones([0,0,0], dtype=paddle.bool), - cache: paddle.Tensor= paddle.zeros([0,0,0]), + mask_pad: paddle.Tensor, + cache: paddle.Tensor ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 37b124e84..80518ec57 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -121,11 +121,13 @@ class DecoderLayer(nn.Layer): if self.concat_after: tgt_concat = paddle.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, + paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]), dim=-1) x = residual + self.concat_linear1(tgt_concat) else: x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, + paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]) if not self.normalize_before: x = self.norm1(x) @@ -134,11 +136,13 @@ class DecoderLayer(nn.Layer): x = self.norm2(x) if self.concat_after: x_concat = paddle.cat( - (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) + (x, self.src_attn(x, memory, memory, memory_mask, + paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]), dim=-1) x = residual + self.concat_linear2(x_concat) else: x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask)[0]) + self.src_attn(x, memory, memory, memory_mask, + paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]) if not self.normalize_before: x = self.norm2(x) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index bff2d69bb..5bc1b20d3 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -177,7 +177,8 @@ class BaseEncoder(nn.Layer): decoding_chunk_size, self.static_chunk_size, num_decoding_left_chunks) for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, + paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) if self.normalize_before: xs = self.after_norm(xs) # Here we assume the mask is not changed in encoder layers, so just @@ -190,9 +191,9 @@ class BaseEncoder(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), - cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), - att_mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, + att_mask: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk Args: diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 5f810dfde..674e72a30 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -76,9 +76,9 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + mask_pad: paddle.Tensor, + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: @@ -105,7 +105,7 @@ class TransformerEncoderLayer(nn.Layer): if self.normalize_before: x = self.norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache) + x_att, new_att_cache = self.self_attn(x, x, x, mask, paddle.empty([0]), cache=att_cache) if self.concat_after: x_concat = paddle.concat((x, x_att), axis=-1) @@ -193,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + mask_pad: paddle.Tensor, + att_cache: paddle.Tensor, + cnn_cache: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: