diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 124649987..6c7e75c1f 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -13,19 +13,3 @@ # limitations under the License. import _locale _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) - - - - - - - - - - - - - - - - diff --git a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py index 3457f51a8..688bf5f84 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py @@ -1114,6 +1114,7 @@ class Wav2Vec2Model(nn.Layer): class Wav2Vec2ConfigPure(): model_type = "wav2vec2" + def __init__(self, config): self.output_attentions = False self.output_hidden_states = False diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py index 6b9d6cb30..c610b22d7 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py @@ -2,16 +2,20 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - # S3PRL has no contribution to this file # The file was copied from fairseq to remove the dependency on the entire fairseq package - import logging import math import uuid -from dataclasses import dataclass, field -from enum import Enum, EnumMeta -from typing import Callable, Dict, List, Optional, Tuple +from dataclasses import dataclass +from dataclasses import field +from enum import Enum +from enum import EnumMeta +from typing import Callable +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple import numpy as np import paddle @@ -22,7 +26,6 @@ from paddle import Tensor logger = logging.getLogger(__name__) - class GLU(nn.Layer): r"""Applies the gated linear unit function :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half @@ -42,13 +45,15 @@ class GLU(nn.Layer): >>> input = paddle.randn([4, 2]) >>> output = m(input) """ - def __init__(self, axis: int = -1) -> None: + + def __init__(self, axis: int=-1) -> None: super().__init__() self.axis = axis def forward(self, input: Tensor) -> Tensor: return F.glu(input, self.axis) + class FairseqIncrementalState(object): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -61,10 +66,9 @@ class FairseqIncrementalState(object): return "{}.{}".format(self._incremental_state_id, key) def get_incremental_state( - self, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], - key: str, - ) -> Optional[Dict[str, Optional[Tensor]]]: + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + key: str, ) -> Optional[Dict[str, Optional[Tensor]]]: """Helper for getting incremental state for an nn.Layer.""" full_key = self._get_full_incremental_state_key(key) if incremental_state is None or full_key not in incremental_state: @@ -72,10 +76,10 @@ class FairseqIncrementalState(object): return incremental_state[full_key] def set_incremental_state( - self, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], - key: str, - value: Dict[str, Optional[Tensor]], + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + key: str, + value: Dict[str, Optional[Tensor]], ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]: """Helper for setting 
incremental state for an nn.Layer.""" if incremental_state is not None: @@ -85,9 +89,8 @@ class FairseqIncrementalState(object): def with_incremental_state(cls): - cls.__bases__ = (FairseqIncrementalState,) + tuple( - b for b in cls.__bases__ if b != FairseqIncrementalState - ) + cls.__bases__ = (FairseqIncrementalState, ) + tuple( + b for b in cls.__bases__ if b != FairseqIncrementalState) return cls @@ -105,25 +108,21 @@ class FairseqDropout(paddle.nn.Layer): return x def make_generation_fast_( - self, - name: str, - retain_dropout: bool = False, - retain_dropout_modules: Optional[List[str]] = None, - **kwargs, - ): + self, + name: str, + retain_dropout: bool=False, + retain_dropout_modules: Optional[List[str]]=None, + **kwargs, ): if retain_dropout: if retain_dropout_modules is not None and self.module_name is None: logger.warning( "Cannot enable dropout during inference for module {} " - "because module_name was not set".format(name) - ) - elif ( - retain_dropout_modules is None # if None, apply to all modules - or self.module_name in retain_dropout_modules - ): - logger.info( - "Enabling dropout during inference for module: {}".format(name) - ) + "because module_name was not set".format(name)) + elif (retain_dropout_modules is + None # if None, apply to all modules + or self.module_name in retain_dropout_modules): + logger.info("Enabling dropout during inference for module: {}". + format(name)) self.apply_during_inference = True else: logger.info("Disabling dropout for module: {}".format(name)) @@ -162,16 +161,15 @@ def quant_noise(module, p, block_size): # 2D matrix if not is_conv: assert ( - module.weight.shape[1] % block_size == 0 - ), "Input features must be a multiple of block sizes" + module.weight.shape[1] % + block_size == 0), "Input features must be a multiple of block sizes" # 4D matrix else: # 1x1 convolutions if module.weight.shape[2:] == (1, 1): - assert ( - module.weight.shape[1] % block_size == 0 - ), "Input channels must be a multiple of block sizes" + assert (module.weight.shape[1] % block_size == 0 + ), "Input channels must be a multiple of block sizes" # regular convolutions else: k = module.weight.shape[2] * module.weight.shape[3] @@ -188,9 +186,11 @@ def quant_noise(module, p, block_size): # split weight matrix into blocks and randomly drop selected blocks mask = paddle.zeros( - [in_features // block_size * out_features], dtype=paddle.bool) + [in_features // block_size * out_features], + dtype=paddle.bool) mask.bernoulli_(p) - mask = mask.unsqueeze(1).tile([1, block_size]).reshape([-1, in_features]) + mask = mask.unsqueeze(1).tile([1, block_size]).reshape( + [-1, in_features]) else: # gather weight and sizes @@ -201,14 +201,13 @@ def quant_noise(module, p, block_size): # split weight matrix into blocks and randomly drop selected blocks if module.weight.shape[2:] == (1, 1): mask = paddle.zeros( - [in_channels // block_size * out_channels], dtype=paddle.bool - ) + [in_channels // block_size * out_channels], + dtype=paddle.bool) mask.bernoulli_(p) - mask = mask.unsqueeze(1).tile([1, block_size]).reshape([-1, in_channels]) + mask = mask.unsqueeze(1).tile([1, block_size]).reshape( + [-1, in_channels]) else: - mask = paddle.zeros( - weight.shape - ) + mask = paddle.zeros(weight.shape) mask.bernoulli_(p) mask = mask.unsqueeze(1).tile([1, in_channels, 1, 1]) @@ -228,28 +227,26 @@ class MultiheadAttention(nn.Layer): """ def __init__( - self, - embed_dim, - num_heads, - kdim=None, - vdim=None, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - 
self_attention=False, - encoder_decoder_attention=False, - q_noise=0.0, - qn_block_size=8, - # TODO: pass in config rather than string. - # config defined in xformers.components.attention.AttentionConfig - xformers_att_config: Optional[str] = None, - xformers_blocksparse_layout: Optional[ - paddle.Tensor - ] = None, # This should be part of the config - xformers_blocksparse_blocksize: Optional[ - int - ] = 16, # This should be part of the config + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + q_noise=0.0, + qn_block_size=8, + # TODO: pass in config rather than string. + # config defined in xformers.components.attention.AttentionConfig + xformers_att_config: Optional[str]=None, + xformers_blocksparse_layout: Optional[ + paddle.Tensor]=None, # This should be part of the config + xformers_blocksparse_blocksize: Optional[ + int]=16, # This should be part of the config ): super().__init__() @@ -271,22 +268,20 @@ class MultiheadAttention(nn.Layer): self.num_heads = num_heads self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) + dropout, module_name=self.__class__.__name__) self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert (self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" - ) - + "Self-attention requires query, key and " + "value to be of the same size") + weight_attr = paddle.ParamAttr(initializer=nn.initializer.XavierUniform) bias_attr = nn.initializer.Constant(0) # self.k_proj = quant_noise( @@ -303,16 +298,22 @@ class MultiheadAttention(nn.Layer): # nn.Linear(embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else bias_attr), q_noise, qn_block_size # ) self.k_proj = nn.Linear(self.kdim, embed_dim) - + self.v_proj = nn.Linear(self.vdim, embed_dim) - + self.q_proj = nn.Linear(embed_dim, embed_dim) self.out_proj = nn.Linear(embed_dim, embed_dim) if add_bias_kv: - self.bias_k = paddle.create_parameter(shape=[1, 1, embed_dim], dtype='float32', initializer=nn.initializer.XavierUniform) - self.bias_v = paddle.create_parameter(shape=[1, 1, embed_dim], dtype='float32', initializer=nn.initializer.XavierUniform) + self.bias_k = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.XavierUniform()) + self.bias_v = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype='float32', + default_initializer=nn.initializer.XavierUniform()) else: self.bias_k = self.bias_v = None @@ -327,25 +328,25 @@ class MultiheadAttention(nn.Layer): self.onnx_trace = True # def reset_parameters(self): - # if self.qkv_same_dim: - # # Empirically observed the convergence to be much better with - # # the scaled initialization - # nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2)) - # nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2)) - # nn.initializer.XavierUniform(self.q_proj.weight, gain=1 / math.sqrt(2)) + # if self.qkv_same_dim: + # # Empirically observed the convergence to be much better with + # # the scaled initialization + # 
nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2)) + # nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2)) + # nn.initializer.XavierUniform(self.q_proj.weight, gain=1 / math.sqrt(2)) # else: - # self.k_proj.weight = paddle.ParamAttr() - # nn.initializer.XavierUniform(self.k_proj.weight) - # nn.initializer.XavierUniform(self.v_proj.weight) - # nn.initializer.XavierUniform(self.q_proj.weight) - - # nn.initializer.XavierUniform(self.out_proj.weight) - # if self.out_proj.bias is not None: - # nn.initializer.Constant(self.out_proj.bias) - # if self.bias_k is not None: - # nn.initializer.XavierNormal(self.bias_k) - # if self.bias_v is not None: - # nn.initializer.XavierNormal(self.bias_v) + # self.k_proj.weight = paddle.ParamAttr() + # nn.initializer.XavierUniform(self.k_proj.weight) + # nn.initializer.XavierUniform(self.v_proj.weight) + # nn.initializer.XavierUniform(self.q_proj.weight) + + # nn.initializer.XavierUniform(self.out_proj.weight) + # if self.out_proj.bias is not None: + # nn.initializer.Constant(self.out_proj.bias) + # if self.bias_k is not None: + # nn.initializer.XavierNormal(self.bias_k) + # if self.bias_v is not None: + # nn.initializer.XavierNormal(self.bias_v) def _get_reserve_head_index(self, num_heads_to_keep: int): k_proj_heads_norm = [] @@ -356,45 +357,25 @@ class MultiheadAttention(nn.Layer): start_idx = i * self.head_dim end_idx = (i + 1) * self.head_dim k_proj_heads_norm.append( - paddle.sum( - paddle.abs( - self.k_proj.weight[ - start_idx:end_idx, - ] - ) - ).tolist() - + paddle.sum(paddle.abs(self.k_proj.bias[start_idx:end_idx])).tolist() - ) + paddle.sum(paddle.abs(self.k_proj.weight[start_idx:end_idx, ])) + .tolist() + paddle.sum( + paddle.abs(self.k_proj.bias[start_idx:end_idx])).tolist()) q_proj_heads_norm.append( - paddle.sum( - paddle.abs( - self.q_proj.weight[ - start_idx:end_idx, - ] - ) - ).tolist() - + paddle.sum(paddle.abs(self.q_proj.bias[start_idx:end_idx])).tolist() - ) + paddle.sum(paddle.abs(self.q_proj.weight[start_idx:end_idx, ])) + .tolist() + paddle.sum( + paddle.abs(self.q_proj.bias[start_idx:end_idx])).tolist()) v_proj_heads_norm.append( - paddle.sum( - paddle.abs( - self.v_proj.weight[ - start_idx:end_idx, - ] - ) - ).tolist() - + paddle.sum(paddle.abs(self.v_proj.bias[start_idx:end_idx])).tolist() - ) + paddle.sum(paddle.abs(self.v_proj.weight[start_idx:end_idx, ])) + .tolist() + paddle.sum( + paddle.abs(self.v_proj.bias[start_idx:end_idx])).tolist()) heads_norm = [] for i in range(self.num_heads): - heads_norm.append( - k_proj_heads_norm[i] + q_proj_heads_norm[i] + v_proj_heads_norm[i] - ) + heads_norm.append(k_proj_heads_norm[i] + q_proj_heads_norm[i] + + v_proj_heads_norm[i]) sorted_head_index = sorted( - range(self.num_heads), key=lambda k: heads_norm[k], reverse=True - ) + range(self.num_heads), key=lambda k: heads_norm[k], reverse=True) reserve_head_index = [] for i in range(num_heads_to_keep): start = sorted_head_index[i] * self.head_dim @@ -414,40 +395,29 @@ class MultiheadAttention(nn.Layer): for ele in reserve_head_index: start_idx, end_idx = ele - new_q_weight.append( - self.q_proj.weight[ - start_idx:end_idx, - ] - ) + new_q_weight.append(self.q_proj.weight[start_idx:end_idx, ]) new_q_bias.append(self.q_proj.bias[start_idx:end_idx]) - new_k_weight.append( - self.k_proj.weight[ - start_idx:end_idx, - ] - ) + new_k_weight.append(self.k_proj.weight[start_idx:end_idx, ]) new_k_bias.append(self.k_proj.bias[start_idx:end_idx]) - new_v_weight.append( - self.v_proj.weight[ - start_idx:end_idx, - ] - 
) + new_v_weight.append(self.v_proj.weight[start_idx:end_idx, ]) new_v_bias.append(self.v_proj.bias[start_idx:end_idx]) - new_out_proj_weight.append(self.out_proj.weight[:, start_idx:end_idx]) + new_out_proj_weight.append( + self.out_proj.weight[:, start_idx:end_idx]) new_q_weight = paddle.concat(new_q_weight).detach() new_k_weight = paddle.concat(new_k_weight).detach() new_v_weight = paddle.concat(new_v_weight).detach() - new_out_proj_weight = paddle.concat(new_out_proj_weight, axis=-1).detach() + new_out_proj_weight = paddle.concat( + new_out_proj_weight, axis=-1).detach() new_q_weight.stop_gradient = False new_k_weight.stop_gradient = False new_v_weight.stop_gradient = False new_out_proj_weight.stop_gradient = False - new_q_bias = paddle.concat(new_q_bias).detach() new_q_bias.stop_gradient = False @@ -457,16 +427,38 @@ class MultiheadAttention(nn.Layer): new_v_bias = paddle.concat(new_v_bias).detach() new_v_bias.stop_gradient = False - self.q_proj.weight = paddle.create_parameter(shape=new_q_weight.shape, dtype=new_q_weight.dtype, default_initializer=paddle.nn.initializer.Assign(new_q_weight)) - self.q_proj.bias = paddle.create_parameter(shape=new_q_bias.shape, dtype=new_q_bias.dtype, default_initializer=paddle.nn.initializer.Assign(new_q_bias)) - - self.k_proj.weight = paddle.create_parameter(shape=new_k_weight.shape, dtype=new_k_weight.dtype, default_initializer=paddle.nn.initializer.Assign(new_k_weight)) - self.k_proj.bias = paddle.create_parameter(shape=new_k_bias.shape, dtype=new_k_bias.dtype, default_initializer=paddle.nn.initializer.Assign(new_k_bias)) - - self.v_proj.weight = paddle.create_parameter(shape=new_v_weight.shape, dtype=new_v_weight.dtype, default_initializer=paddle.nn.initializer.Assign(new_v_weight)) - self.v_proj.bias = paddle.create_parameter(shape=new_v_bias.shape, dtype=new_v_bias.dtype, default_initializer=paddle.nn.initializer.Assign(new_v_bias)) - - self.out_proj.weight = paddle.create_parameter(shape=new_out_proj_weight.shape, dtype=new_out_proj_weight.dtype, default_initializer=paddle.nn.initializer.Assign(new_out_proj_weight)) + self.q_proj.weight = paddle.create_parameter( + shape=new_q_weight.shape, + dtype=new_q_weight.dtype, + default_initializer=paddle.nn.initializer.Assign(new_q_weight)) + self.q_proj.bias = paddle.create_parameter( + shape=new_q_bias.shape, + dtype=new_q_bias.dtype, + default_initializer=paddle.nn.initializer.Assign(new_q_bias)) + + self.k_proj.weight = paddle.create_parameter( + shape=new_k_weight.shape, + dtype=new_k_weight.dtype, + default_initializer=paddle.nn.initializer.Assign(new_k_weight)) + self.k_proj.bias = paddle.create_parameter( + shape=new_k_bias.shape, + dtype=new_k_bias.dtype, + default_initializer=paddle.nn.initializer.Assign(new_k_bias)) + + self.v_proj.weight = paddle.create_parameter( + shape=new_v_weight.shape, + dtype=new_v_weight.dtype, + default_initializer=paddle.nn.initializer.Assign(new_v_weight)) + self.v_proj.bias = paddle.create_parameter( + shape=new_v_bias.shape, + dtype=new_v_bias.dtype, + default_initializer=paddle.nn.initializer.Assign(new_v_bias)) + + self.out_proj.weight = paddle.create_parameter( + shape=new_out_proj_weight.shape, + dtype=new_out_proj_weight.dtype, + default_initializer=paddle.nn.initializer.Assign( + new_out_proj_weight)) self.num_heads = len(reserve_head_index) self.embed_dim = self.head_dim * self.num_heads @@ -478,67 +470,74 @@ class MultiheadAttention(nn.Layer): self.skip_embed_dim_check = True def _pad_masks( - self, - key_padding_mask: Optional[Tensor], - attn_mask: 
Optional[Tensor], + self, + key_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor], ) -> Tuple[Optional[Tensor], Optional[Tensor]]: if attn_mask is not None: - shape = attn_mask.shape[:-1] + [1,] - attn_mask = paddle.concat([attn_mask, paddle.zeros(shape, dtype=attn_mask.dtype)], axis=-1) + shape = attn_mask.shape[:-1] + [ + 1, + ] + attn_mask = paddle.concat( + [attn_mask, paddle.zeros(shape, dtype=attn_mask.dtype)], + axis=-1) if key_padding_mask is not None: - shape = key_padding_mask.shape[:-1] + [1,] - key_padding_mask = paddle.concat([key_padding_mask, paddle.zeros(shape, dtype=key_padding_mask.dtype)], axis=-1) + shape = key_padding_mask.shape[:-1] + [ + 1, + ] + key_padding_mask = paddle.concat( + [ + key_padding_mask, paddle.zeros( + shape, dtype=key_padding_mask.dtype) + ], + axis=-1) return key_padding_mask, attn_mask def _add_bias( - self, - k: Tensor, - v: Tensor, - key_padding_mask: Optional[Tensor], - attn_mask: Optional[Tensor], - bsz: int, + self, + k: Tensor, + v: Tensor, + key_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor], + bsz: int, ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: assert self.bias_k is not None assert self.bias_v is not None k = paddle.concat([k, self.bias_k.tile([1, bsz, 1])], axis=-1) v = paddle.concat([v, self.bias_v.tile([1, bsz, 1])], axis=-1) key_padding_mask, attn_mask = self._pad_masks( - key_padding_mask=key_padding_mask, attn_mask=attn_mask - ) + key_padding_mask=key_padding_mask, attn_mask=attn_mask) return k, v, key_padding_mask, attn_mask def _append_zero_attn( - self, - k: Tensor, - v: Tensor, - key_padding_mask: Optional[Tensor], - attn_mask: Optional[Tensor], + self, + k: Tensor, + v: Tensor, + key_padding_mask: Optional[Tensor], + attn_mask: Optional[Tensor], ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: zero_attn_shape = k.shape[:-2] + [1] + k.shape[-1:] k = paddle.concat( - [k, paddle.zeros(zero_attn_shape, dtype=k.dtype)], axis=-2 - ) + [k, paddle.zeros(zero_attn_shape, dtype=k.dtype)], axis=-2) v = paddle.concat( - [v, paddle.zeros(zero_attn_shape, dtype=v.dtype)], axis=-2 - ) + [v, paddle.zeros(zero_attn_shape, dtype=v.dtype)], axis=-2) key_padding_mask, attn_mask = self._pad_masks( - key_padding_mask=key_padding_mask, attn_mask=attn_mask - ) + key_padding_mask=key_padding_mask, attn_mask=attn_mask) return k, v, key_padding_mask, attn_mask def forward( - self, - query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - need_weights: bool = True, - static_kv: bool = False, - attn_mask: Optional[Tensor] = None, - before_softmax: bool = False, - need_head_weights: bool = False, - ) -> Tuple[Tensor, Optional[Tensor]]: + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor]=None, + incremental_state: Optional[Dict[str, Dict[str, Optional[ + Tensor]]]]=None, + need_weights: bool=True, + static_kv: bool=False, + attn_mask: Optional[Tensor]=None, + before_softmax: bool=False, + need_head_weights: bool=False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time x Batch x Channel Args: @@ -564,9 +563,8 @@ class MultiheadAttention(nn.Layer): tgt_len, bsz, embed_dim = query.shape src_len = tgt_len if not self.skip_embed_dim_check: - assert ( - embed_dim == self.embed_dim - ), f"query dim {embed_dim} != {self.embed_dim}" + assert (embed_dim == self.embed_dim + ), f"query dim {embed_dim} != 
{self.embed_dim}" assert list(query.shape) == [tgt_len, bsz, embed_dim] # if key is not None: # src_len, key_bsz, _ = key.size() @@ -590,35 +588,35 @@ class MultiheadAttention(nn.Layer): # ): # assert key is not None and value is not None - # if self.use_xformers: - # return self._xformers_attn_forward( - # query, key, value, key_padding_mask, need_weights, attn_mask - # ) - - # else: - # return F.multi_head_attention_forward( - # query, - # key, - # value, - # self.embed_dim, - # self.num_heads, - # torch.empty([0]), - # torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), - # self.bias_k, - # self.bias_v, - # self.add_zero_attn, - # self.dropout_module.p, - # self.out_proj.weight, - # self.out_proj.bias, - # self.training or self.dropout_module.apply_during_inference, - # key_padding_mask, - # need_weights, - # attn_mask, - # use_separate_proj_weight=True, - # q_proj_weight=self.q_proj.weight, - # k_proj_weight=self.k_proj.weight, - # v_proj_weight=self.v_proj.weight, - # ) + # if self.use_xformers: + # return self._xformers_attn_forward( + # query, key, value, key_padding_mask, need_weights, attn_mask + # ) + + # else: + # return F.multi_head_attention_forward( + # query, + # key, + # value, + # self.embed_dim, + # self.num_heads, + # torch.empty([0]), + # torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), + # self.bias_k, + # self.bias_v, + # self.add_zero_attn, + # self.dropout_module.p, + # self.out_proj.weight, + # self.out_proj.bias, + # self.training or self.dropout_module.apply_during_inference, + # key_padding_mask, + # need_weights, + # attn_mask, + # use_separate_proj_weight=True, + # q_proj_weight=self.q_proj.weight, + # k_proj_weight=self.k_proj.weight, + # v_proj_weight=self.v_proj.weight, + # ) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) @@ -644,13 +642,13 @@ class MultiheadAttention(nn.Layer): else: if self.beam_size > 1 and bsz == key.size(1): # key is [T, bsz*beam_size, C], reduce to [T, bsz, C] - key = key.view(key.size(0), -1, self.beam_size, key.size(2))[ - :, :, 0, : - ] + key = key.view( + key.size(0), -1, self.beam_size, + key.size(2))[:, :, 0, :] if key_padding_mask is not None: key_padding_mask = key_padding_mask.view( - -1, self.beam_size, key_padding_mask.size(1) - )[:, 0, :] + -1, self.beam_size, + key_padding_mask.size(1))[:, 0, :] k = self.k_proj(key) v = self.v_proj(key) @@ -664,16 +662,21 @@ class MultiheadAttention(nn.Layer): if self.bias_k is not None: assert self.bias_v is not None k, v, attn_mask, key_padding_mask = self._add_bias( - k, v, attn_mask, key_padding_mask, bsz - ) + k, v, attn_mask, key_padding_mask, bsz) - q = paddle.reshape(q, [tgt_len, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2]) + q = paddle.reshape( + q, [tgt_len, bsz * self.num_heads, self.head_dim]).transpose( + [1, 0, 2]) kv_bsz = bsz # need default value for scripting if k is not None: kv_bsz = k.shape[1] - k = paddle.reshape(k, [-1, kv_bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2]) + k = paddle.reshape( + k, [-1, kv_bsz * self.num_heads, self.head_dim]).transpose( + [1, 0, 2]) if v is not None: - v = paddle.reshape(v, [-1, kv_bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2]) + v = paddle.reshape( + v, [-1, kv_bsz * self.num_heads, self.head_dim]).transpose( + [1, 0, 2]) if saved_state is not None: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) @@ -681,7 +684,8 @@ class MultiheadAttention(nn.Layer): _prev_key = saved_state["prev_key"] assert 
_prev_key is not None kv_bsz = _prev_key.shape[0] - prev_key = _prev_key.reshape([kv_bsz * self.num_heads, -1, self.head_dim]) + prev_key = _prev_key.reshape( + [kv_bsz * self.num_heads, -1, self.head_dim]) if static_kv: k = prev_key else: @@ -693,8 +697,7 @@ class MultiheadAttention(nn.Layer): assert _prev_value is not None assert kv_bsz == _prev_value.size(0) prev_value = _prev_value.reshape( - [kv_bsz * self.num_heads, -1, self.head_dim] - ) + [kv_bsz * self.num_heads, -1, self.head_dim]) if static_kv: v = prev_value else: @@ -709,17 +712,17 @@ class MultiheadAttention(nn.Layer): prev_key_padding_mask=prev_key_padding_mask, batch_size=kv_bsz, src_len=k.shape[1], - static_kv=static_kv, - ) + static_kv=static_kv, ) - saved_state["prev_key"] = k.reshape([kv_bsz, self.num_heads, -1, self.head_dim]) + saved_state["prev_key"] = k.reshape( + [kv_bsz, self.num_heads, -1, self.head_dim]) saved_state["prev_value"] = v.reshape( - [kv_bsz, self.num_heads, -1, self.head_dim] - ) + [kv_bsz, self.num_heads, -1, self.head_dim]) saved_state["prev_key_padding_mask"] = key_padding_mask # In this branch incremental_state is never None assert incremental_state is not None - incremental_state = self._set_input_buffer(incremental_state, saved_state) + incremental_state = self._set_input_buffer(incremental_state, + saved_state) assert k is not None assert k.shape[1] == src_len @@ -736,21 +739,26 @@ class MultiheadAttention(nn.Layer): assert v is not None src_len += 1 k, v, key_padding_mask, attn_mask = self._append_zero_attn( - k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask - ) + k=k, + v=v, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask) if self.encoder_decoder_attention and bsz != kv_bsz: attn_weights = paddle.einsum( "bxhtd,bhsd->bxhts", q.reshape([kv_bsz, -1, self.num_heads] + q.shape[1:]), - k.reshape([kv_bsz, self.num_heads] + k.shape[1:]), - ) - attn_weights = attn_weights.reshape([-1,] + attn_weights.shape[-2:]) + k.reshape([kv_bsz, self.num_heads] + k.shape[1:]), ) + attn_weights = attn_weights.reshape([ + -1, + ] + attn_weights.shape[-2:]) else: attn_weights = paddle.bmm(q, k.transpose([0, 2, 1])) - attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, + bsz) - assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len] + assert list( + attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: attn_mask = attn_mask.unsqueeze(0) @@ -760,37 +768,37 @@ class MultiheadAttention(nn.Layer): if key_padding_mask is not None: # don't attend to padding symbols - attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights.reshape( + [bsz, self.num_heads, tgt_len, src_len]) if not is_tpu: attn_weights = attn_weights.reshape( - [kv_bsz, -1, self.num_heads, tgt_len, src_len] - ) + [kv_bsz, -1, self.num_heads, tgt_len, src_len]) attn_weights = paddle.where( - key_padding_mask.unsqueeze(1) - .unsqueeze(2) - .unsqueeze(3) + key_padding_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3) .astype('bool'), float('-inf') * paddle.ones_like(attn_weights), - attn_weights - ) + attn_weights) else: attn_weights = attn_weights.transpose([2, 1, 0]) - attn_weights = paddle.where(key_padding_mask, float('-inf') * paddle.ones_like(attn_weights), attn_weights) + attn_weights = paddle.where(key_padding_mask, + float('-inf') * + paddle.ones_like(attn_weights), + attn_weights) attn_weights = attn_weights.transpose([2, 1, 0]) - 
attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights.reshape( + [bsz * self.num_heads, tgt_len, src_len]) if before_softmax: return attn_weights, v - def softmax_supporting_onnx_trace(x, dim: int, onnx_trace: bool = False): + def softmax_supporting_onnx_trace(x, dim: int, onnx_trace: bool=False): if onnx_trace: return F.softmax(x, axis=dim) else: return F.softmax(x, axis=dim, dtype='float32') attn_weights_float = softmax_supporting_onnx_trace( - attn_weights, dim=-1, onnx_trace=self.onnx_trace - ) + attn_weights, dim=-1, onnx_trace=self.onnx_trace) attn_weights = paddle.cast(attn_weights_float, attn_weights.dtype) attn_probs = self.dropout_module(attn_weights) @@ -798,34 +806,28 @@ if self.encoder_decoder_attention and bsz != kv_bsz: attn = paddle.einsum( "bxhts,bhsd->bxhtd", - attn_probs.reshape( - [kv_bsz, - -1, - self.num_heads] - + attn_probs.shape[1:] - ), - v.reshape( - [kv_bsz, - self.num_heads] - + v.shape[1:] - ), - ) - attn = attn.reshape([-1,] + attn.shape[-2:]) + attn_probs.reshape([kv_bsz, -1, self.num_heads] + + attn_probs.shape[1:]), + v.reshape([kv_bsz, self.num_heads] + v.shape[1:]), ) + attn = attn.reshape([ + -1, + ] + attn.shape[-2:]) else: attn = paddle.bmm(attn_probs, v) - assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim] + assert list( + attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim] if self.onnx_trace and attn.shape[1] == 1: # when ONNX tracing a single decoder step (sequence length == 1) # the transpose is a no-op copy before view, thus unnecessary attn = attn.reshape([tgt_len, bsz, self.embed_dim]) else: - attn = attn.transpose([1, 0, 2]).reshape([tgt_len, bsz, self.embed_dim]) + attn = attn.transpose([1, 0, 2]).reshape( + [tgt_len, bsz, self.embed_dim]) attn = self.out_proj(attn) attn_weights: Optional[Tensor] = None if need_weights: attn_weights = attn_weights_float.reshape( - [bsz, self.num_heads, tgt_len, src_len] - ).transpose([1, 0, 2, 3]) + [bsz, self.num_heads, tgt_len, src_len]).transpose([1, 0, 2, 3]) if not need_head_weights: # average attention weights over heads attn_weights = attn_weights.mean(axis=0) @@ -834,52 +836,51 @@ class MultiheadAttention(nn.Layer): @staticmethod def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], - batch_size: int, - src_len: int, - static_kv: bool, - ) -> Optional[Tensor]: + key_padding_mask: Optional[Tensor], + prev_key_padding_mask: Optional[Tensor], + batch_size: int, + src_len: int, + static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask elif prev_key_padding_mask is not None and key_padding_mask is not None: - new_key_padding_mask = paddle.concat( - [paddle.cast(prev_key_padding_mask, 'float32'), paddle.cast(key_padding_mask, 'float32')], axis==1 - ) + new_key_padding_mask = paddle.concat([ + paddle.cast(prev_key_padding_mask, 'float32'), + paddle.cast(key_padding_mask, 'float32') + ], axis=1) # During incremental decoding, as the padding token enters and # leaves the frame, there will be a time when prev or current # is None elif prev_key_padding_mask is not None: if src_len > prev_key_padding_mask.shape[1]: filler = paddle.zeros( - [batch_size, src_len - prev_key_padding_mask.shape[1]], - ) - new_key_padding_mask = paddle.concat( - [paddle.cast(prev_key_padding_mask, 'float32'), 
paddle.cast(filler, 'float32')], axis==1 - ) + new_key_padding_mask = paddle.concat([ + paddle.cast(prev_key_padding_mask, 'float32'), + paddle.cast(filler, 'float32') + ], axis=1) else: new_key_padding_mask = prev_key_padding_mask elif key_padding_mask is not None: if src_len > key_padding_mask.shape[1]: filler = paddle.zeros( - [batch_size, src_len - key_padding_mask.shape[1]], - ) - new_key_padding_mask = paddle.concat( - [paddle.cast(filler,'float32'), paddle.cast(key_padding_mask,'float32')], axis==1 - ) + [batch_size, src_len - key_padding_mask.shape[1]], ) + new_key_padding_mask = paddle.concat([ + paddle.cast(filler, 'float32'), + paddle.cast(key_padding_mask, 'float32') + ], axis=1) else: - new_key_padding_mask = paddle.cast(key_padding_mask,'float32') + new_key_padding_mask = paddle.cast(key_padding_mask, 'float32') else: new_key_padding_mask = prev_key_padding_mask return new_key_padding_mask @paddle.jit.to_static def reorder_incremental_state( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - new_order: Tensor, - ): + self, + incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + new_order: Tensor, ): """Reorder buffered internal state (for incremental generation).""" input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: @@ -887,19 +888,24 @@ class MultiheadAttention(nn.Layer): input_buffer_k = input_buffer[k] if input_buffer_k is not None: if self.encoder_decoder_attention: - if input_buffer_k.shape[0] * self.beam_size == new_order.shape[0]: + if input_buffer_k.shape[ + 0] * self.beam_size == new_order.shape[0]: return incremental_state elif self.beam_size > 1: input_buffer[k] = paddle.index_select( input_buffer_k, - index=new_order.reshape([-1, self.beam_size])[:, 0] // self.beam_size, - axis=0, - ) + index=new_order.reshape( + [-1, self.beam_size])[:, 0] // + self.beam_size, + axis=0, ) else: - input_buffer[k] = paddle.index_select(input_buffer_k, index=new_order, axis=0) + input_buffer[k] = paddle.index_select( + input_buffer_k, index=new_order, axis=0) else: - input_buffer[k] = paddle.index_select(input_buffer_k, index=new_order, axis=0) - incremental_state = self._set_input_buffer(incremental_state, input_buffer) + input_buffer[k] = paddle.index_select( + input_buffer_k, index=new_order, axis=0) + incremental_state = self._set_input_buffer(incremental_state, + input_buffer) return incremental_state def set_beam_size(self, beam_size): @@ -907,7 +913,8 @@ class MultiheadAttention(nn.Layer): self.beam_size = beam_size def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ) -> Dict[str, Optional[Tensor]]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: @@ -917,13 +924,17 @@ class MultiheadAttention(nn.Layer): return empty_result def _set_input_buffer( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], - ): - return self.set_incremental_state(incremental_state, "attn_state", buffer) - - def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): + self, + incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + buffer: Dict[str, Optional[Tensor]], ): + return self.set_incremental_state(incremental_state, "attn_state", + buffer) + + def apply_sparse_mask(self, + attn_weights, + tgt_len: int, + src_len: int, + 
bsz: int): return attn_weights def upgrade_state_dict_named(self, state_dict, name): @@ -935,19 +946,21 @@ class MultiheadAttention(nn.Layer): # in_proj_weight used to be q + k + v with same dimensions dim = int(state_dict[k].shape[0] / 3) items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] - items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim] - items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :] + items_to_add[prefix + + "k_proj.weight"] = state_dict[k][dim:2 * dim] + items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim:] keys_to_remove.append(k) k_bias = prefix + "in_proj_bias" if k_bias in state_dict.keys(): dim = int(state_dict[k].shape[0] / 3) - items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] + items_to_add[prefix + + "q_proj.bias"] = state_dict[k_bias][:dim] items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][ - dim : 2 * dim - ] - items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :] + dim:2 * dim] + items_to_add[prefix + + "v_proj.bias"] = state_dict[k_bias][2 * dim:] keys_to_remove.append(prefix + "in_proj_bias") @@ -957,20 +970,20 @@ class MultiheadAttention(nn.Layer): for key, value in items_to_add.items(): state_dict[key] = value + class GumbelVectorQuantizer(nn.Layer): def __init__( - self, - dim, - num_vars, - temp, - groups, - combine_groups, - vq_dim, - time_first, - activation=nn.GELU(), - weight_proj_depth=1, - weight_proj_factor=1, - ): + self, + dim, + num_vars, + temp, + groups, + combine_groups, + vq_dim, + time_first, + activation=nn.GELU(), + weight_proj_depth=1, + weight_proj_factor=1, ): """Vector quantization using gumbel softmax Args: @@ -1001,13 +1014,15 @@ class GumbelVectorQuantizer(nn.Layer): var_dim = vq_dim // groups num_groups = groups if not combine_groups else 1 - self.vars = self.create_parameter((1, num_groups * num_vars, var_dim), default_initializer=nn.initializer.Uniform()) - + self.vars = self.create_parameter( + (1, num_groups * num_vars, var_dim), + default_initializer=nn.initializer.Uniform()) if weight_proj_depth > 1: def block(input_dim, output_dim): - return nn.Sequential(nn.Linear(input_dim, output_dim), activation) + return nn.Sequential( + nn.Linear(input_dim, output_dim), activation) inner_dim = self.input_dim * weight_proj_factor self.weight_proj = nn.Sequential( @@ -1015,8 +1030,7 @@ class GumbelVectorQuantizer(nn.Layer): block(self.input_dim if i == 0 else inner_dim, inner_dim) for i in range(weight_proj_depth - 1) ], - nn.Linear(inner_dim, groups * num_vars), - ) + nn.Linear(inner_dim, groups * num_vars), ) else: self.weight_proj = nn.Linear(self.input_dim, groups * num_vars) nn.initializer.Normal(mean=0, std=1)(self.weight_proj.weight) @@ -1033,9 +1047,8 @@ class GumbelVectorQuantizer(nn.Layer): self.codebook_indices = None def set_num_updates(self, num_updates): - self.curr_temp = max( - self.max_temp * self.temp_decay**num_updates, self.min_temp - ) + self.curr_temp = max(self.max_temp * self.temp_decay**num_updates, + self.min_temp) def get_codebook_indices(self): if self.codebook_indices is None: @@ -1044,13 +1057,11 @@ class GumbelVectorQuantizer(nn.Layer): p = [range(self.num_vars)] * self.groups inds = list(product(*p)) self.codebook_indices = paddle.to_tensor( - inds, dtype='int64', place=self.vars.place - ).flatten() + inds, dtype='int64', place=self.vars.place).flatten() if not self.combine_groups: self.codebook_indices = self.codebook_indices.reshape( - self.num_vars**self.groups, -1 - ) + self.num_vars**self.groups, -1) for b in 
range(1, self.groups): self.codebook_indices[:, b] += self.num_vars * b self.codebook_indices = self.codebook_indices.flatten() @@ -1058,23 +1069,20 @@ class GumbelVectorQuantizer(nn.Layer): def codebook(self): indices = self.get_codebook_indices() - return ( - self.vars.squeeze(0) - .index_select(0, indices) - .reshape(self.num_vars**self.groups, -1) - ) + return (self.vars.squeeze(0).index_select(0, indices) + .reshape(self.num_vars**self.groups, -1)) def sample_from_codebook(self, b, n): indices = self.get_codebook_indices() indices = indices.reshape(-1, self.groups) cb_size = indices.shape[0] - assert ( - n < cb_size - ), f"sample size {n} is greater than size of codebook {cb_size}" - sample_idx = paddle.randint(low=0, high=cb_size, shape=(b * n,)) + assert (n < cb_size + ), f"sample size {n} is greater than size of codebook {cb_size}" + sample_idx = paddle.randint(low=0, high=cb_size, shape=(b * n, )) indices = indices[sample_idx] - z = self.vars.squeeze(0).index_select(0, indices.flatten()).reshape(b, n, -1) + z = self.vars.squeeze(0).index_select(0, indices.flatten()).reshape( + b, n, -1) return z def to_codebook_index(self, indices): @@ -1104,24 +1112,24 @@ class GumbelVectorQuantizer(nn.Layer): hard_x.scatter_(-1, k.reshape([-1, 1]), 1.0) hard_x = hard_x.reshape([bsz * tsz, self.groups, -1]) hard_probs = paddle.mean(hard_x.astype('float32'), axis=0) - result["code_perplexity"] = paddle.exp( - -paddle.sum(hard_probs * paddle.log(hard_probs + 1e-7), axis=-1) - ).sum() - - avg_probs = F.softmax(x.reshape([bsz * tsz, self.groups, -1]).astype('float32'), axis=-1).mean(axis=0) - result["prob_perplexity"] = paddle.exp( - -paddle.sum(avg_probs * paddle.log(avg_probs + 1e-7), axis=-1) - ).sum() + result["code_perplexity"] = paddle.exp(-paddle.sum( + hard_probs * paddle.log(hard_probs + 1e-7), axis=-1)).sum() + avg_probs = F.softmax( + x.reshape([bsz * tsz, self.groups, -1]).astype('float32'), + axis=-1).mean(axis=0) + result["prob_perplexity"] = paddle.exp(-paddle.sum( + avg_probs * paddle.log(avg_probs + 1e-7), axis=-1)).sum() result["temp"] = self.curr_temp if self.training: - x = F.gumbel_softmax(x.astype('float32'), tau=self.curr_temp, hard=True).astype(x.dtype) + x = F.gumbel_softmax( + x.astype('float32'), tau=self.curr_temp, + hard=True).astype(x.dtype) else: x = hard_x - x = x.reshape([bsz * tsz, -1]) vars = self.vars @@ -1129,12 +1137,9 @@ class GumbelVectorQuantizer(nn.Layer): vars = vars.tile([1, self.groups, 1]) if produce_targets: - result["targets"] = ( - x.reshape([bsz * tsz * self.groups, -1]) - .argmax(axis=-1) - .reshape([bsz, tsz, self.groups]) - .detach() - ) + result["targets"] = (x.reshape([bsz * tsz * self.groups, -1]) + .argmax(axis=-1) + .reshape([bsz, tsz, self.groups]).detach()) x = x.unsqueeze(-1) * vars x = x.reshape([bsz * tsz, self.groups, self.num_vars, -1]) @@ -1148,6 +1153,7 @@ class GumbelVectorQuantizer(nn.Layer): return result + class GradMultiply(paddle.autograd.PyLayer): @staticmethod def forward(ctx, x, scale): @@ -1170,7 +1176,7 @@ class SamePad(nn.Layer): def forward(self, x): if self.remove > 0: - x = x[:, :, : -self.remove] + x = x[:, :, :-self.remove] return x @@ -1188,7 +1194,11 @@ class TransposeLast(nn.Layer): def LayerNorm(normalized_shape, eps=1e-5): - return nn.LayerNorm(normalized_shape, epsilon=eps, weight_attr=paddle.ParamAttr(), bias_attr=paddle.ParamAttr()) + return nn.LayerNorm( + normalized_shape, + epsilon=eps, + weight_attr=paddle.ParamAttr(), + bias_attr=paddle.ParamAttr()) class Fp32LayerNorm(nn.LayerNorm): @@ -1203,13 +1213,14 
@@ class Fp32LayerNorm(nn.LayerNorm): self._normalized_shape, self.weight.astype('float32') if self.weight is not None else None, self.bias.astype('float32') if self.bias is not None else None, - self._epsilon, - ) + self._epsilon, ) return output.astype(input.dtype) + class Fp32GroupNorm(nn.GroupNorm): def __init__(self, *args, **kwargs): - super().__init__( *args, **kwargs) + super().__init__(*args, **kwargs) + def forward(self, input): # import pdb # pdb.set_trace() @@ -1218,8 +1229,7 @@ class Fp32GroupNorm(nn.GroupNorm): self._num_groups, self.weight.astype('float32') if self.weight is not None else None, self.bias.astype('float32') if self.bias is not None else None, - self._epsilon, - ) + self._epsilon, ) return output.astype(input.dtype) @@ -1260,11 +1270,8 @@ def get_activation_fn(activation: str) -> Callable: def gelu_accurate(x): if not hasattr(gelu_accurate, "_a"): gelu_accurate._a = math.sqrt(2 / math.pi) - return ( - 0.5 - * x - * (1 + paddle.tanh(gelu_accurate._a * (x + 0.044715 * paddle.pow(x, 3)))) - ) + return (0.5 * x * (1 + paddle.tanh(gelu_accurate._a * + (x + 0.044715 * paddle.pow(x, 3))))) def gelu(x: paddle.Tensor) -> paddle.Tensor: return paddle.nn.functional.gelu(x.astype('float32')).astype(x.dtype) @@ -1286,7 +1293,8 @@ def get_activation_fn(activation: str) -> Callable: elif activation == "swish": return paddle.nn.Swish else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) + raise RuntimeError( + "--activation-fn {} not supported".format(activation)) def get_available_activation_fns() -> List: @@ -1301,18 +1309,17 @@ def get_available_activation_fns() -> List: def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[paddle.Tensor], - mask_prob: float, - mask_length: int, - mask_type: str = "static", - mask_other: float = 0.0, - min_masks: int = 0, - no_overlap: bool = False, - min_space: int = 0, - require_same_masks: bool = True, - mask_dropout: float = 0.0, -) -> np.ndarray: + shape: Tuple[int, int], + padding_mask: Optional[paddle.Tensor], + mask_prob: float, + mask_length: int, + mask_type: str="static", + mask_other: float=0.0, + min_masks: int=0, + no_overlap: bool=False, + min_space: int=0, + require_same_masks: bool=True, + mask_dropout: float=0.0, ) -> np.ndarray: """ Computes random mask spans for a given shape @@ -1340,9 +1347,7 @@ def compute_mask_indices( all_num_mask = int( # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() - ) + mask_prob * all_sz / float(mask_length) + np.random.rand()) all_num_mask = max(min_masks, all_num_mask) @@ -1352,9 +1357,7 @@ def compute_mask_indices( sz = all_sz - padding_mask[i].long().sum().item() num_mask = int( # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() - ) + mask_prob * sz / float(mask_length) + np.random.rand()) num_mask = max(min_masks, num_mask) else: sz = all_sz @@ -1363,7 +1366,8 @@ def compute_mask_indices( if mask_type == "static": lengths = np.full(num_mask, mask_length) elif mask_type == "uniform": - lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) + lengths = np.random.randint( + mask_other, mask_length * 2 + 1, size=num_mask) elif mask_type == "normal": lengths = np.random.normal(mask_length, mask_other, size=num_mask) lengths = [max(1, int(round(x))) for x in lengths] @@ -1394,9 +1398,9 @@ def compute_mask_indices( min_length = min(lengths) for length in sorted(lengths, reverse=True): lens = np.fromiter( - 
(e - s if e - s >= length + min_space else 0 for s, e in parts), - np.int, - ) + (e - s if e - s >= length + min_space else 0 + for s, e in parts), + np.int, ) l_sum = np.sum(lens) if l_sum == 0: break @@ -1412,13 +1416,10 @@ def compute_mask_indices( mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) - mask_idc = np.asarray( - [ - mask_idc[j] + offset - for j in range(len(mask_idc)) - for offset in range(lengths[j]) - ] - ) + mask_idc = np.asarray([ + mask_idc[j] + offset + for j in range(len(mask_idc)) for offset in range(lengths[j]) + ]) mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) @@ -1429,8 +1430,7 @@ def compute_mask_indices( if mask_dropout > 0: num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int) mask_idc = np.random.choice( - mask_idc, len(mask_idc) - num_holes, replace=False - ) + mask_idc, len(mask_idc) - num_holes, replace=False) mask[i, mask_idc] = True @@ -1460,12 +1460,17 @@ def pad_to_multiple(x, multiple, dim=-1, value=0): remainder = math.ceil(m) * multiple - tsz if m.is_integer(): return x, 0 - pad_offset = (0,) * (-1 - dim) * 2 - return F.pad(x, pad=[*pad_offset, 0, remainder, *pad_offset], value=value, data_format='NLC'), remainder + pad_offset = (0, ) * (-1 - dim) * 2 + return F.pad( + x, + pad=[*pad_offset, 0, remainder, *pad_offset], + value=value, + data_format='NLC'), remainder EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"]) -MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(["static", "uniform", "normal", "poisson"]) +MASKING_DISTRIBUTION_CHOICES = ChoiceEnum( + ["static", "uniform", "normal", "poisson"]) LAYER_TYPE_CHOICES = ChoiceEnum(["transformer"]) # ToDo: conformer @@ -1474,46 +1479,39 @@ class Wav2Vec2Config: extractor_mode: EXTRACTOR_MODE_CHOICES = field( default="default", metadata={ - "help": "mode for feature extractor. default has a single group norm with d " + "help": + "mode for feature extractor. 
default has a single group norm with d " "groups in the first conv block, whereas layer_norm has layer norms in " "every block (meant to use with normalize=True)" - }, - ) + }, ) encoder_layers: int = field( - default=12, metadata={"help": "num encoder layers in the transformer"} - ) + default=12, metadata={"help": "num encoder layers in the transformer"}) encoder_embed_dim: int = field( - default=768, metadata={"help": "encoder embedding dimension"} - ) + default=768, metadata={"help": "encoder embedding dimension"}) encoder_ffn_embed_dim: int = field( - default=3072, metadata={"help": "encoder embedding dimension for FFN"} - ) + default=3072, metadata={"help": "encoder embedding dimension for FFN"}) encoder_attention_heads: int = field( - default=12, metadata={"help": "num encoder attention heads"} - ) + default=12, metadata={"help": "num encoder attention heads"}) activation_fn: ChoiceEnum(get_available_activation_fns()) = field( - default="gelu", metadata={"help": "activation function to use"} - ) + default="gelu", metadata={"help": "activation function to use"}) layer_type: LAYER_TYPE_CHOICES = field( - default="transformer", metadata={"help": "layer type in encoder"} - ) + default="transformer", metadata={"help": "layer type in encoder"}) # dropouts dropout: float = field( - default=0.1, metadata={"help": "dropout probability for the transformer"} - ) + default=0.1, + metadata={"help": "dropout probability for the transformer"}) attention_dropout: float = field( - default=0.1, metadata={"help": "dropout probability for attention weights"} - ) + default=0.1, + metadata={"help": "dropout probability for attention weights"}) activation_dropout: float = field( - default=0.0, metadata={"help": "dropout probability after activation in FFN"} - ) + default=0.0, + metadata={"help": "dropout probability after activation in FFN"}) encoder_layerdrop: float = field( - default=0.0, metadata={"help": "probability of dropping a tarnsformer layer"} - ) + default=0.0, + metadata={"help": "probability of dropping a tarnsformer layer"}) dropout_input: float = field( default=0.0, - metadata={"help": "dropout to apply to the input (after feat extr)"}, - ) + metadata={"help": "dropout to apply to the input (after feat extr)"}, ) dropout_features: float = field( default=0.0, metadata={"help": "dropout to apply to the features (after feat extr)"}, @@ -1522,85 +1520,79 @@ class Wav2Vec2Config: final_dim: int = field( default=0, metadata={ - "help": "project final representations and targets to this many dimensions." + "help": + "project final representations and targets to this many dimensions." 
"set to encoder_embed_dim is <= 0" - }, - ) + }, ) layer_norm_first: bool = field( - default=False, metadata={"help": "apply layernorm first in the transformer"} - ) + default=False, + metadata={"help": "apply layernorm first in the transformer"}) conv_feature_layers: str = field( default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", metadata={ - "help": "string describing convolutional feature extraction layers in form of a python list that contains " + "help": + "string describing convolutional feature extraction layers in form of a python list that contains " "[(dim, kernel_size, stride), ...]" - }, - ) + }, ) conv_bias: bool = field( - default=False, metadata={"help": "include bias in conv encoder"} - ) + default=False, metadata={"help": "include bias in conv encoder"}) logit_temp: float = field( - default=0.1, metadata={"help": "temperature to divide logits by"} - ) + default=0.1, metadata={"help": "temperature to divide logits by"}) quantize_targets: bool = field( - default=False, metadata={"help": "use quantized targets"} - ) + default=False, metadata={"help": "use quantized targets"}) quantize_input: bool = field( - default=False, metadata={"help": "use quantized inputs"} - ) + default=False, metadata={"help": "use quantized inputs"}) same_quantizer: bool = field( - default=False, metadata={"help": "use same quantizer for inputs and targets"} - ) + default=False, + metadata={"help": "use same quantizer for inputs and targets"}) target_glu: bool = field( - default=False, metadata={"help": "adds projection + glu to targets"} - ) + default=False, metadata={"help": "adds projection + glu to targets"}) feature_grad_mult: float = field( - default=1.0, metadata={"help": "multiply feature extractor var grads by this"} - ) + default=1.0, + metadata={"help": "multiply feature extractor var grads by this"}) quantizer_depth: int = field( default=1, - metadata={"help": "number of quantizer layers"}, - ) + metadata={"help": "number of quantizer layers"}, ) quantizer_factor: int = field( default=3, metadata={ - "help": "dimensionality increase for inner quantizer layers (if depth > 1)" - }, - ) + "help": + "dimensionality increase for inner quantizer layers (if depth > 1)" + }, ) latent_vars: int = field( default=320, - metadata={"help": "number of latent variables V in each group of the codebook"}, - ) + metadata={ + "help": "number of latent variables V in each group of the codebook" + }, ) latent_groups: int = field( default=2, - metadata={"help": "number of groups G of latent variables in the codebook"}, - ) + metadata={ + "help": "number of groups G of latent variables in the codebook" + }, ) latent_dim: int = field( default=0, metadata={ - "help": "if > 0, uses this dimensionality for latent variables. " + "help": + "if > 0, uses this dimensionality for latent variables. 
" "otherwise uses final_dim / latent_groups" - }, - ) + }, ) # masking mask_length: int = field(default=10, metadata={"help": "mask length"}) mask_prob: float = field( - default=0.65, metadata={"help": "probability of replacing a token with mask"} - ) + default=0.65, + metadata={"help": "probability of replacing a token with mask"}) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( - default="static", metadata={"help": "how to choose mask length"} - ) + default="static", metadata={"help": "how to choose mask length"}) mask_other: float = field( default=0, metadata={ - "help": "secondary mask argument (used for more complex distributions), " + "help": + "secondary mask argument (used for more complex distributions), " "see help in compute_mask_indices" - }, - ) + }, ) no_mask_overlap: bool = field( - default=False, metadata={"help": "whether to allow masks to overlap"} - ) + default=False, metadata={"help": "whether to allow masks to overlap"}) mask_min_space: int = field( default=1, metadata={"help": "min space between spans (if no overlap is enabled)"}, @@ -1608,37 +1600,35 @@ class Wav2Vec2Config: require_same_masks: bool = field( default=True, metadata={ - "help": "whether to number of masked timesteps must be the same across all " + "help": + "whether to number of masked timesteps must be the same across all " "examples in a batch" - }, - ) + }, ) mask_dropout: float = field( default=0.0, - metadata={"help": "percent of masks to unmask for each sample"}, - ) + metadata={"help": "percent of masks to unmask for each sample"}, ) # channel masking mask_channel_length: int = field( - default=10, metadata={"help": "length of the mask for features (channels)"} - ) + default=10, + metadata={"help": "length of the mask for features (channels)"}) mask_channel_prob: float = field( - default=0.0, metadata={"help": "probability of replacing a feature with 0"} - ) + default=0.0, + metadata={"help": "probability of replacing a feature with 0"}) mask_channel_before: bool = False mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", - metadata={"help": "how to choose mask length for channel masking"}, - ) + metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ - "help": "secondary mask argument (used for more complex distributions), " + "help": + "secondary mask argument (used for more complex distributions), " "see help in compute_mask_indicesh" - }, - ) + }, ) no_mask_channel_overlap: bool = field( - default=False, metadata={"help": "whether to allow channel masks to overlap"} - ) + default=False, + metadata={"help": "whether to allow channel masks to overlap"}) mask_channel_min_space: int = field( default=1, metadata={"help": "min space between spans (if no overlap is enabled)"}, @@ -1647,76 +1637,77 @@ class Wav2Vec2Config: # negative selection num_negatives: int = field( default=100, - metadata={"help": "number of negative examples from the same sample"}, - ) + metadata={"help": "number of negative examples from the same sample"}, ) negatives_from_everywhere: bool = field( default=False, - metadata={"help": "sample negatives from everywhere, not just masked states"}, - ) + metadata={ + "help": "sample negatives from everywhere, not just masked states" + }, ) cross_sample_negatives: int = field( - default=0, metadata={"help": "number of negative examples from the any sample"} - ) + default=0, + metadata={"help": "number of negative examples from the any sample"}) codebook_negatives: int = field( - 
default=0, metadata={"help": "number of negative examples codebook"} - ) + default=0, metadata={"help": "number of negative examples codebook"}) # positional embeddings conv_pos: int = field( default=128, - metadata={"help": "number of filters for convolutional positional embeddings"}, - ) + metadata={ + "help": "number of filters for convolutional positional embeddings" + }, ) conv_pos_groups: int = field( default=16, - metadata={"help": "number of groups for convolutional positional embedding"}, - ) + metadata={ + "help": "number of groups for convolutional positional embedding" + }, ) pos_conv_depth: int = field( default=1, - metadata={"help": "depth of positional encoder network"}, - ) + metadata={"help": "depth of positional encoder network"}, ) latent_temp: Tuple[float, float, float] = field( default=(2, 0.5, 0.999995), metadata={ - "help": "temperature for latent variable sampling. " + "help": + "temperature for latent variable sampling. " "can be tuple of 3 values (start, end, decay)" - }, - ) - max_positions: int = field(default=100000, metadata={"help": "Max positions"}) + }, ) + max_positions: int = field( + default=100000, metadata={"help": "Max positions"}) checkpoint_activations: bool = field( default=False, - metadata={"help": "recompute activations and save memory for extra compute"}, - ) + metadata={ + "help": "recompute activations and save memory for extra compute" + }, ) # FP16 optimization required_seq_len_multiple: int = field( default=2, metadata={ - "help": "pad the input to encoder such that the sequence length is divisible by multiple" - }, - ) + "help": + "pad the input to encoder such that the sequence length is divisible by multiple" + }, ) crop_seq_to_multiple: int = field( default=1, metadata={ - "help": "crop convolutional feature extractor output such that the sequence length is divisible by multiple" - }, - ) + "help": + "crop convolutional feature extractor output such that the sequence length is divisible by multiple" + }, ) # Conformer depthwise_conv_kernel_size: int = field( default=31, metadata={ - "help": "depthwise-conv-kernel-size for convolution in conformer layer" - }, - ) + "help": + "depthwise-conv-kernel-size for convolution in conformer layer" + }, ) attn_type: str = field( default="", - metadata={"help": "if espnet use ESPNET MHA"}, - ) + metadata={"help": "if espnet use ESPNET MHA"}, ) pos_enc_type: str = field( default="abs", - metadata={"help": "Positional encoding type to use in conformer"}, - ) - fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) + metadata={"help": "Positional encoding type to use in conformer"}, ) + fp16: bool = field( + default=False, metadata={"help": "If fp16 is being used"}) class Wav2Vec2Model(nn.Layer): @@ -1731,14 +1722,11 @@ class Wav2Vec2Model(nn.Layer): conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, - conv_bias=cfg.conv_bias, - ) + conv_bias=cfg.conv_bias, ) - self.post_extract_proj = ( - nn.Linear(self.embed, cfg.encoder_embed_dim) - if self.embed != cfg.encoder_embed_dim and not cfg.quantize_input - else None - ) + self.post_extract_proj = (nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim and + not cfg.quantize_input else None) self.crop_seq_to_multiple = cfg.crop_seq_to_multiple @@ -1785,8 +1773,7 @@ class Wav2Vec2Model(nn.Layer): vq_dim=vq_dim, time_first=True, weight_proj_depth=cfg.quantizer_depth, - weight_proj_factor=cfg.quantizer_factor, - ) + weight_proj_factor=cfg.quantizer_factor, ) self.project_q = 
nn.Linear(vq_dim, final_dim) else: self.project_q = nn.Linear(self.embed, final_dim) @@ -1806,15 +1793,13 @@ class Wav2Vec2Model(nn.Layer): vq_dim=vq_dim, time_first=True, weight_proj_depth=cfg.quantizer_depth, - weight_proj_factor=cfg.quantizer_factor, - ) + weight_proj_factor=cfg.quantizer_factor, ) self.project_inp = nn.Linear(vq_dim, cfg.encoder_embed_dim) self.mask_emb = self.create_parameter( shape=[cfg.encoder_embed_dim], default_initializer=paddle.nn.initializer.Uniform(), - dtype='float32', - ) + dtype='float32', ) encoder_cls = TransformerEncoder @@ -1824,8 +1809,7 @@ class Wav2Vec2Model(nn.Layer): self.target_glu = None if cfg.target_glu: self.target_glu = nn.Sequential( - nn.Linear(final_dim, final_dim * 2), GLU() - ) + nn.Linear(final_dim, final_dim * 2), GLU()) self.final_proj = nn.Linear(cfg.encoder_embed_dim, final_dim) @@ -1840,12 +1824,11 @@ class Wav2Vec2Model(nn.Layer): return cls(cfg) def apply_mask( - self, - x, - padding_mask, - mask_indices=None, - mask_channel_indices=None, - ): + self, + x, + padding_mask, + mask_indices=None, + mask_channel_indices=None, ): B, T, C = x.shape if self.mask_channel_prob > 0 and self.mask_channel_before: @@ -1857,13 +1840,10 @@ class Wav2Vec2Model(nn.Layer): self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) + min_space=self.mask_channel_min_space, ) mask_channel_indices = ( paddle.to_tensor(mask_channel_indices, place=x.place) - .unsqueeze(1) - .expand([-1, T, -1]) - ) + .unsqueeze(1).expand([-1, T, -1])) x[mask_channel_indices] = 0 if self.mask_prob > 0: @@ -1879,8 +1859,7 @@ class Wav2Vec2Model(nn.Layer): no_overlap=self.no_mask_overlap, min_space=self.mask_min_space, require_same_masks=self.cfg.require_same_masks, - mask_dropout=self.cfg.mask_dropout, - ) + mask_dropout=self.cfg.mask_dropout, ) mask_indices = paddle.to_tensor(mask_indices, place=x.place) x = index_put(x, mask_indices, self.mask_emb) else: @@ -1896,13 +1875,10 @@ class Wav2Vec2Model(nn.Layer): self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) + min_space=self.mask_channel_min_space, ) mask_channel_indices = ( paddle.to_tensor(mask_channel_indices, place=x.place) - .unsqueeze(1) - .expand([-1, T, -1]) - ) + .unsqueeze(1).expand([-1, T, -1])) x = index_put(x, mask_channel_indices, 0) return x, mask_indices @@ -1922,31 +1898,21 @@ class Wav2Vec2Model(nn.Layer): assert high > 1, f"{bsz,tsz,fsz}" if self.n_negatives > 0: - tszs = ( - buffered_arange(num) - .unsqueeze(-1) - .expand([-1, self.n_negatives]) - .flatten() - ) + tszs = (buffered_arange(num).unsqueeze(-1) .expand([-1, self.n_negatives]).flatten()) neg_idxs = paddle.randint( - low=0, high=high - 1, shape=[bsz, self.n_negatives * num] - ) + low=0, high=high - 1, shape=[bsz, self.n_negatives * num]) neg_idxs[neg_idxs >= tszs] += 1 if self.cross_sample_negatives > 0: - tszs = ( - buffered_arange(num) - .unsqueeze(-1) - .expand([-1, self.cross_sample_negatives]) - .flatten() - ) + tszs = (buffered_arange(num).unsqueeze(-1) .expand([-1, self.cross_sample_negatives]).flatten()) cross_neg_idxs = paddle.randint( low=0, high=cross_high - 1, - shape=[bsz, self.cross_sample_negatives * num], - ) + shape=[bsz, self.cross_sample_negatives * num], ) cross_neg_idxs[cross_neg_idxs >= tszs] += 1 if self.n_negatives > 0: @@ -1959,10 +1925,8 @@ class Wav2Vec2Model(nn.Layer): negs = y[neg_idxs.reshape([-1])] negs = negs.reshape( - [bsz, num,
self.n_negatives + self.cross_sample_negatives, fsz] - ).transpose( - [2, 0, 1, 3] - ) # to NxBxTxC + [bsz, num, self.n_negatives + self.cross_sample_negatives, + fsz]).transpose([2, 0, 1, 3]) # to NxBxTxC return negs, neg_idxs def compute_preds(self, x, y, negatives): @@ -1987,23 +1951,21 @@ class Wav2Vec2Model(nn.Layer): conv_cfg_list = eval(self.cfg.conv_feature_layers) for i in range(len(conv_cfg_list)): - input_lengths = _conv_out_length( - input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2] - ) + input_lengths = _conv_out_length(input_lengths, conv_cfg_list[i][1], + conv_cfg_list[i][2]) return paddle.cast(input_lengths, 'int64') def forward( - self, - source, - padding_mask=None, - mask=True, - features_only=False, - layer=None, - mask_indices=None, - mask_channel_indices=None, - padding_count=None, - ): + self, + source, + padding_mask=None, + mask=True, + features_only=False, + layer=None, + mask_indices=None, + mask_channel_indices=None, + padding_count=None, ): if self.feature_grad_mult > 0: features = self.feature_extractor(source) @@ -2022,21 +1984,18 @@ class Wav2Vec2Model(nn.Layer): if padding_mask is not None and padding_mask.any(): input_lengths = (1 - paddle.cast(padding_mask, 'int64')).sum(-1) # apply conv formula to get real output_lengths - output_lengths = self._get_feat_extract_output_lengths(input_lengths) + output_lengths = self._get_feat_extract_output_lengths( + input_lengths) padding_mask = paddle.zeros( - features.shape[:2], dtype=features.dtype - ) + features.shape[:2], dtype=features.dtype) # these two operations makes sure that all values # before the output lengths indices are attended to - padding_mask[ - ( - paddle.arange(padding_mask.shape[0]), - output_lengths - 1, - ) - ] = 1 - padding_mask = paddle.cast((1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])), 'bool') + padding_mask[(paddle.arange(padding_mask.shape[0]), + output_lengths - 1, )] = 1 + padding_mask = paddle.cast( + (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])), 'bool') else: padding_mask = None @@ -2072,18 +2031,18 @@ class Wav2Vec2Model(nn.Layer): features, padding_mask, mask_indices=mask_indices, - mask_channel_indices=mask_channel_indices, - ) + mask_channel_indices=mask_channel_indices, ) if mask_indices is not None: - y = unmasked_features[mask_indices].reshape( - [unmasked_features.shape[0], -1, unmasked_features.shape[-1]] - ) + y = unmasked_features[mask_indices].reshape([ + unmasked_features.shape[0], -1, unmasked_features.shape[-1] + ]) else: x = features y = unmasked_features mask_indices = None - x, layer_results = self.encoder(x, padding_mask=padding_mask, layer=layer) + x, layer_results = self.encoder( + x, padding_mask=padding_mask, layer=layer) if features_only: return { @@ -2106,8 +2065,7 @@ class Wav2Vec2Model(nn.Layer): negs, _ = self.sample_negatives( y, mask_indices[0].sum(), - padding_count=padding_count, - ) + padding_count=padding_count, ) y = y[mask_indices].reshape([y.shape[0], -1, y.shape[-1]]) else: @@ -2123,16 +2081,14 @@ class Wav2Vec2Model(nn.Layer): negs, _ = self.sample_negatives( y, y.shape[1], - padding_count=padding_count, - ) + padding_count=padding_count, ) if self.codebook_negatives > 0: cb_negs = self.quantizer.sample_from_codebook( - y.shape[0] * y.shape[1], self.codebook_negatives - ) + y.shape[0] * y.shape[1], self.codebook_negatives) cb_negs = cb_negs.reshape( - [self.codebook_negatives, y.shape[0], y.shape[1], -1] - ) # order doesnt matter + [self.codebook_negatives, y.shape[0], y.shape[1], + -1]) # order doesnt matter cb_negs = 
self.project_q(cb_negs) negs = paddle.concat([negs, cb_negs], axis=0) else: @@ -2142,15 +2098,13 @@ class Wav2Vec2Model(nn.Layer): negs, _ = self.sample_negatives( unmasked_features, y.shape[1], - padding_count=padding_count, - ) + padding_count=padding_count, ) negs = self.project_q(negs) else: negs, _ = self.sample_negatives( y, y.shape[1], - padding_count=padding_count, - ) + padding_count=padding_count, ) x = x[mask_indices].reshape([x.shape[0], -1, x.shape[-1]]) @@ -2184,8 +2138,7 @@ class Wav2Vec2Model(nn.Layer): def extract_features(self, source, padding_mask, mask=False, layer=None): res = self.forward( - source, padding_mask, mask=mask, features_only=True, layer=layer - ) + source, padding_mask, mask=mask, features_only=True, layer=layer) return res def get_logits(self, net_output): @@ -2202,10 +2155,8 @@ class Wav2Vec2Model(nn.Layer): pen = [] if "prob_perplexity" in net_output: - pen.append( - (net_output["num_vars"] - net_output["prob_perplexity"]) - / net_output["num_vars"] - ) + pen.append((net_output["num_vars"] - net_output["prob_perplexity"]) + / net_output["num_vars"]) if "features_pen" in net_output: pen.append(net_output["features_pen"]) @@ -2220,39 +2171,41 @@ class Wav2Vec2Model(nn.Layer): if last_layer is not None: self.encoder.layers = nn.LayerList( - l for i, l in enumerate(self.encoder.layers) if i <= last_layer - ) + l for i, l in enumerate(self.encoder.layers) if i <= last_layer) class ConvFeatureExtractionModel(nn.Layer): def __init__( - self, - conv_layers: List[Tuple[int, int, int]], - dropout: float = 0.0, - mode: str = "default", - conv_bias: bool = False, - ): + self, + conv_layers: List[Tuple[int, int, int]], + dropout: float=0.0, + mode: str="default", + conv_bias: bool=False, ): super().__init__() assert mode in {"default", "layer_norm"} def block( - n_in, - n_out, - k, - stride, - is_layer_norm=False, - is_group_norm=False, - conv_bias=False, - ): + n_in, + n_out, + k, + stride, + is_layer_norm=False, + is_group_norm=False, + conv_bias=False, ): def make_conv(): - conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias if not conv_bias else paddle.ParamAttr()) + conv = nn.Conv1D( + n_in, + n_out, + k, + stride=stride, + bias_attr=conv_bias + if not conv_bias else paddle.ParamAttr()) # nn.initializer.KaimingNormal()(conv.weight) return conv - assert ( - is_layer_norm and is_group_norm - ) == False, "layer norm and group norm are exclusive" + assert (is_layer_norm and is_group_norm + ) is False, "layer norm and group norm are exclusive" if is_layer_norm: return nn.Sequential( @@ -2261,19 +2214,17 @@ class ConvFeatureExtractionModel(nn.Layer): nn.Sequential( TransposeLast(), Fp32LayerNorm(dim), - TransposeLast(), - ), - nn.GELU(), - ) + TransposeLast(), ), + nn.GELU(), ) elif is_group_norm: return nn.Sequential( make_conv(), nn.Dropout(p=dropout), Fp32GroupNorm(dim, dim), - nn.GELU(), - ) + nn.GELU(), ) else: - return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) + return nn.Sequential( + make_conv(), nn.Dropout(p=dropout), nn.GELU()) in_d = 1 self.conv_layers = nn.LayerList() @@ -2289,9 +2240,7 @@ class ConvFeatureExtractionModel(nn.Layer): stride, is_layer_norm=mode == "layer_norm", is_group_norm=mode == "default" and i == 0, - conv_bias=conv_bias, - ) - ) + conv_bias=conv_bias, )) in_d = dim def forward(self, x): @@ -2312,8 +2261,7 @@ def make_conv_pos(e, k, g): e, kernel_size=k, padding=k // 2, - groups=g, - ) + groups=g, ) dropout = 0 std = math.sqrt((4 * (1.0 - dropout)) / (k * e)) nn.initializer.Normal(mean=0, 
std=std)(pos_conv.weight) @@ -2335,8 +2283,7 @@ class TransformerEncoder(nn.Layer): attention_dropout=args.attention_dropout, activation_dropout=args.activation_dropout, activation_fn=args.activation_fn, - layer_norm_first=args.layer_norm_first, - ) + layer_norm_first=args.layer_norm_first, ) return layer def __init__(self, args: Wav2Vec2Config): @@ -2352,40 +2299,33 @@ class TransformerEncoder(nn.Layer): k = max(3, args.conv_pos // num_layers) def make_conv_block(e, k, g, l): - return nn.Sequential( - *[ - nn.Sequential( - nn.Conv1D( - e, - e, - kernel_size=k, - padding=k // 2, - groups=g, - ), - SamePad(k), - TransposeLast(), - LayerNorm(e, elementwise_affine=False), - TransposeLast(), - nn.GELU(), - ) - for _ in range(l) - ] - ) - - self.pos_conv = make_conv_block( - self.embedding_dim, k, args.conv_pos_groups, num_layers - ) + return nn.Sequential(*[ + nn.Sequential( + nn.Conv1D( + e, + e, + kernel_size=k, + padding=k // 2, + groups=g, ), + SamePad(k), + TransposeLast(), + LayerNorm(e, elementwise_affine=False), + TransposeLast(), + nn.GELU(), ) for _ in range(l) + ]) + + self.pos_conv = make_conv_block(self.embedding_dim, k, + args.conv_pos_groups, num_layers) else: self.pos_conv = make_conv_pos( self.embedding_dim, args.conv_pos, - args.conv_pos_groups, - ) + args.conv_pos_groups, ) - self.layers = nn.LayerList( - [self.build_encoder_layer(args) for _ in range(args.encoder_layers)] - ) + self.layers = nn.LayerList([ + self.build_encoder_layer(args) for _ in range(args.encoder_layers) + ]) self.layer_norm_first = args.layer_norm_first self.layer_norm = LayerNorm(self.embedding_dim) self.layerdrop = args.encoder_layerdrop @@ -2400,12 +2340,11 @@ class TransformerEncoder(nn.Layer): return x, layer_results def extract_features( - self, - x, - padding_mask=None, - tgt_layer=None, - min_layer=0, - ): + self, + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, ): # import pdb # pdb.set_trace() @@ -2421,15 +2360,16 @@ class TransformerEncoder(nn.Layer): # pad to the sequence length dimension x, pad_length = pad_to_multiple( - x, self.required_seq_len_multiple, dim=-2, value=0 - ) + x, self.required_seq_len_multiple, dim=-2, value=0) if pad_length > 0 and padding_mask is None: padding_mask = paddle.zeros([x.shape[0], x.shape[1]], dtype='bool') padding_mask[:, -pad_length:] = True else: padding_mask, _ = pad_to_multiple( - padding_mask, self.required_seq_len_multiple, dim=-1, value=True - ) + padding_mask, + self.required_seq_len_multiple, + dim=-1, + value=True) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C @@ -2441,8 +2381,7 @@ class TransformerEncoder(nn.Layer): dropout_probability = np.random.random() if self.layerdrop > 0 else 1 if not self.training or (dropout_probability > self.layerdrop): x, (z, lr) = layer( - x, self_attn_padding_mask=padding_mask, need_weights=False - ) + x, self_attn_padding_mask=padding_mask, need_weights=False) if i >= min_layer: layer_results.append((x, z, lr)) if i == tgt_layer: @@ -2460,11 +2399,8 @@ class TransformerEncoder(nn.Layer): x = x[:, :-pad_length] def undo_pad(a, b, c): - return ( - a[:-pad_length], - b[:-pad_length] if b is not None else b, - c[:-pad_length], - ) + return (a[:-pad_length], b[:-pad_length] + if b is not None else b, c[:-pad_length], ) layer_results = [undo_pad(*u) for u in layer_results] @@ -2478,6 +2414,7 @@ class TransformerEncoder(nn.Layer): """Upgrade a (possibly old) state dict for new versions of fairseq.""" return state_dict + class TransformerSentenceEncoderLayer(nn.Layer): """ 
Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained @@ -2485,16 +2422,15 @@ class TransformerSentenceEncoderLayer(nn.Layer): """ def __init__( - self, - embedding_dim: float = 768, - ffn_embedding_dim: float = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - layer_norm_first: bool = False, - ) -> None: + self, + embedding_dim: float=768, + ffn_embedding_dim: float=3072, + num_attention_heads: int=8, + dropout: float=0.1, + attention_dropout: float=0.1, + activation_dropout: float=0.1, + activation_fn: str="relu", + layer_norm_first: bool=False, ) -> None: super().__init__() # Initialize parameters @@ -2508,8 +2444,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): self.embedding_dim, num_attention_heads, dropout=attention_dropout, - self_attention=True, - ) + self_attention=True, ) self.dropout1 = nn.Dropout(dropout) self.dropout2 = nn.Dropout(self.activation_dropout) @@ -2526,13 +2461,12 @@ class TransformerSentenceEncoderLayer(nn.Layer): self.final_layer_norm = LayerNorm(self.embedding_dim) def forward( - self, - x: paddle.Tensor, - self_attn_mask: paddle.Tensor = None, - self_attn_padding_mask: paddle.Tensor = None, - need_weights: bool = False, - att_args=None, - ): + self, + x: paddle.Tensor, + self_attn_mask: paddle.Tensor=None, + self_attn_padding_mask: paddle.Tensor=None, + need_weights: bool=False, + att_args=None, ): """ LayerNorm is applied either before or after the self-attention/ffn modules similar to the original Transformer imlementation. @@ -2547,8 +2481,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): value=x, key_padding_mask=self_attn_padding_mask, attn_mask=self_attn_mask, - need_weights=False, - ) + need_weights=False, ) x = self.dropout1(x) x = residual + x @@ -2568,8 +2501,7 @@ class TransformerSentenceEncoderLayer(nn.Layer): key=x, value=x, key_padding_mask=self_attn_padding_mask, - need_weights=False, - ) + need_weights=False, ) x = self.dropout1(x) x = residual + x @@ -2595,19 +2527,20 @@ class AudioPretrainingConfig: sample_rate: int = field( default=16_000, metadata={ - "help": "target sample rate. audio files will be up/down sampled to this rate" - }, - ) + "help": + "target sample rate. 
audio files will be up/down sampled to this rate" + }, ) normalize: bool = field( default=False, - metadata={"help": "if set, normalizes input to have 0 mean and unit variance"}, - ) + metadata={ + "help": "if set, normalizes input to have 0 mean and unit variance" + }, ) enable_padding: bool = field( - default=False, metadata={"help": "pad shorter samples instead of cropping"} - ) + default=False, + metadata={"help": "pad shorter samples instead of cropping"}) max_sample_size: Optional[int] = field( - default=None, metadata={"help": "max sample size to crop to for batching"} - ) + default=None, + metadata={"help": "max sample size to crop to for batching"}) min_sample_size: Optional[int] = field( - default=None, metadata={"help": "min sample size to skip small examples"} - ) + default=None, + metadata={"help": "min sample size to skip small examples"}) diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py index 36d7f744d..14e6c1459 100755 --- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -27,8 +27,8 @@ from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import Spec from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC from paddlespeech.s2t.modules.initializer import DefaultInitializerContext from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank -from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import log_add logger = Log(__name__).getlog()
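Usage note (not part of the patch): the snippet below is a minimal sketch of how the reformatted pieces above fit together, assuming the default Wav2Vec2Config values, that both classes are importable from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model, and that the features-only output dict exposes the encoder states under the key "x"; the dummy waveform and shapes are illustrative only.

import paddle

from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import Wav2Vec2Config
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import Wav2Vec2Model

# Build the model from the dataclass defaults (equivalent to build_model(cfg)).
cfg = Wav2Vec2Config()
model = Wav2Vec2Model(cfg)
model.eval()

# One dummy utterance of 1 s at 16 kHz; extract_features() wraps
# forward(..., features_only=True) as shown in the diff above.
wav = paddle.randn([1, 16000])
with paddle.no_grad():
    res = model.extract_features(wav, padding_mask=None, mask=False)

# Assumption: the returned dict exposes frame-level encoder states under "x".
print(res["x"].shape)  # [batch, frames, cfg.encoder_embed_dim]

Because mask=False and features_only=True, neither the quantizer nor negative sampling is exercised; this only smoke-tests the convolutional front end and the Transformer encoder.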