From 80fc0ef71a0444c91c3caf1742a78f323e79ee3e Mon Sep 17 00:00:00 2001 From: tianhao zhang <15600919271@163.com> Date: Mon, 29 Aug 2022 15:55:31 +0000 Subject: [PATCH] fix multigpu train --- paddlespeech/s2t/modules/align.py | 39 +++++++++++++++---- paddlespeech/s2t/modules/attention.py | 16 ++++---- .../s2t/modules/conformer_convolution.py | 9 ++--- paddlespeech/s2t/modules/decoder_layer.py | 13 +++++-- paddlespeech/s2t/modules/encoder.py | 28 ++++++------- paddlespeech/s2t/modules/encoder_layer.py | 3 +- paddlespeech/s2t/modules/initializer.py | 1 + 7 files changed, 70 insertions(+), 39 deletions(-) diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py index cacda2461..34d796145 100644 --- a/paddlespeech/s2t/modules/align.py +++ b/paddlespeech/s2t/modules/align.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math + import paddle from paddle import nn -import math """ To align the initializer between paddle and torch, the API below are set defalut initializer with priority higger than global initializer. @@ -81,10 +82,18 @@ class Linear(nn.Linear): name=None): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) super(Linear, self).__init__(in_features, out_features, weight_attr, bias_attr, name) @@ -104,10 +113,18 @@ class Conv1D(nn.Conv1D): data_format='NCL'): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) super(Conv1D, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) @@ -128,10 +145,18 @@ class Conv2D(nn.Conv2D): data_format='NCHW'): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = 
paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform( + fan_in=None, + negative_slope=math.sqrt(5), + nonlinearity='leaky_relu')) super(Conv2D, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 8c4eb9c3c..7de8ccf96 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -83,11 +83,11 @@ class MultiHeadedAttention(nn.Layer): return q, k, v - def forward_attention(self, - value: paddle.Tensor, + def forward_attention( + self, + value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor, - ) -> paddle.Tensor: + mask: paddle.Tensor, ) -> paddle.Tensor: """Compute attention context vector. Args: value (paddle.Tensor): Transformed value, size @@ -108,7 +108,7 @@ class MultiHeadedAttention(nn.Layer): # When will `if mask.size(2) > 0` be False? # 1. onnx(16/-1, -1/-1, 16/0) # 2. jit (16/-1, -1/-1, 16/0, 16/4) - if paddle.shape(mask)[2] > 0: # time2 > 0 + if paddle.shape(mask)[2] > 0: # time2 > 0 mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2) # for last chunk, time2 might be larger than scores.size(-1) mask = mask[:, :, :, :paddle.shape(scores)[-1]] @@ -133,8 +133,7 @@ class MultiHeadedAttention(nn.Layer): value: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - cache: paddle.Tensor - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). @@ -249,8 +248,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): value: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - cache: paddle.Tensor - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index b34f9ee1d..db062701f 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -109,8 +109,7 @@ class ConvolutionModule(nn.Layer): def forward(self, x: paddle.Tensor, mask_pad: paddle.Tensor, - cache: paddle.Tensor - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + cache: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: x (paddle.Tensor): Input tensor (#batch, time, channels). 
@@ -127,11 +126,11 @@ class ConvolutionModule(nn.Layer): x = x.transpose([0, 2, 1]) # [B, C, T] # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) if self.lorder > 0: - if paddle.shape(cache)[2] == 0: # cache_t == 0 + if paddle.shape(cache)[2] == 0: # cache_t == 0 x = nn.functional.pad( x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: @@ -161,7 +160,7 @@ class ConvolutionModule(nn.Layer): x = self.pointwise_conv2(x) # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) x = x.transpose([0, 2, 1]) # [B, T, C] diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 80518ec57..c8843b723 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -122,12 +122,15 @@ class DecoderLayer(nn.Layer): if self.concat_after: tgt_concat = paddle.cat( (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]), dim=-1) + paddle.empty([0]), + paddle.zeros([0, 0, 0, 0]))[0]), + dim=-1) x = residual + self.concat_linear1(tgt_concat) else: x = residual + self.dropout( self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]) + paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[ + 0]) if not self.normalize_before: x = self.norm1(x) @@ -137,12 +140,14 @@ class DecoderLayer(nn.Layer): if self.concat_after: x_concat = paddle.cat( (x, self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]), dim=-1) + paddle.empty([0]), + paddle.zeros([0, 0, 0, 0]))[0]), + dim=-1) x = residual + self.concat_linear2(x_concat) else: x = residual + self.dropout( self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), paddle.zeros([0,0,0,0]))[0]) + paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0]) if not self.normalize_before: x = self.norm2(x) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 5bc1b20d3..38139937b 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -177,8 +177,9 @@ class BaseEncoder(nn.Layer): decoding_chunk_size, self.static_chunk_size, num_decoding_left_chunks) for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, - paddle.zeros([0, 0, 0, 0]), paddle.zeros([0, 0, 0, 0])) + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, + paddle.zeros([0, 0, 0, 0]), + paddle.zeros([0, 0, 0, 0])) if self.normalize_before: xs = self.after_norm(xs) # Here we assume the mask is not changed in encoder layers, so just @@ -228,7 +229,7 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers = paddle.shape(att_cache)[0] @@ -253,14 +254,16 @@ class BaseEncoder(nn.Layer): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) xs, _, new_att_cache, new_cnn_cache = layer( - xs, att_mask, pos_emb, - att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, - ) + xs, + att_mask, + pos_emb, + att_cache=att_cache[i:i + 1] if elayers > 0 else 
att_cache, + cnn_cache=cnn_cache[i:i + 1] + if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:,:, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim + r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) + r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim if self.normalize_before: xs = self.after_norm(xs) @@ -271,7 +274,6 @@ class BaseEncoder(nn.Layer): r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) return xs, r_att_cache, r_cnn_cache - def forward_chunk_by_chunk( self, xs: paddle.Tensor, @@ -316,8 +318,8 @@ class BaseEncoder(nn.Layer): num_frames = xs.shape[1] required_cache_size = decoding_chunk_size * num_decoding_left_chunks - att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) - cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) + cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) outputs = [] offset = 0 @@ -327,7 +329,7 @@ class BaseEncoder(nn.Layer): chunk_xs = xs[:, cur:end, :] (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache) + chunk_xs, offset, required_cache_size, att_cache, cnn_cache) outputs.append(y) offset += y.shape[1] diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 674e72a30..8fd991ec6 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -105,7 +105,8 @@ class TransformerEncoderLayer(nn.Layer): if self.normalize_before: x = self.norm1(x) - x_att, new_att_cache = self.self_attn(x, x, x, mask, paddle.empty([0]), cache=att_cache) + x_att, new_att_cache = self.self_attn( + x, x, x, mask, paddle.empty([0]), cache=att_cache) if self.concat_after: x_concat = paddle.concat((x, x_att), axis=-1) diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py index cdcf2e052..e37837d2f 100644 --- a/paddlespeech/s2t/modules/initializer.py +++ b/paddlespeech/s2t/modules/initializer.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np + class DefaultInitializerContext(object): """ egs:
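
Editor's note: the sketch below is my own minimal illustration, not part of the patch. It shows the zero-sized cache convention this change relies on in encoder.py, encoder_layer.py and decoder_layer.py: instead of passing None, callers hand in placeholders like paddle.zeros([0, 0, 0, 0]) or paddle.empty([0]), and each layer tests paddle.shape(cache)[2] == 0 to detect "no cache yet", presumably so the branches stay traceable for jit/onnx export (the comments in attention.py point at the same concern). The helper name append_cache and the tensor shapes here are assumptions for illustration only.

    import paddle

    def append_cache(x: paddle.Tensor, cache: paddle.Tensor) -> paddle.Tensor:
        # cache has shape (B, C, T_cache); a zero-sized time dim means "first chunk",
        # mirroring the `paddle.shape(cache)[2] == 0` checks in the patch.
        if paddle.shape(cache)[2] == 0:
            return x
        return paddle.concat([cache, x], axis=2)

    x = paddle.randn([1, 4, 8])
    empty_cache = paddle.zeros([1, 4, 0])      # sentinel: no cached frames yet
    y1 = append_cache(x, empty_cache)          # first chunk: cache branch is skipped
    y2 = append_cache(x, y1[:, :, -2:])        # later chunk: reuse the last 2 frames
    print(y1.shape, y2.shape)                  # [1, 4, 8] [1, 4, 10]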