diff --git a/deepspeech/modules/activation.py b/deepspeech/modules/activation.py
index 72ccb5346..ecaca5bca 100644
--- a/deepspeech/modules/activation.py
+++ b/deepspeech/modules/activation.py
@@ -25,7 +25,9 @@ from paddle.nn import initializer as I
 
 logger = logging.getLogger(__name__)
 
-__all__ = ['brelu', "glu"]
+__all__ = [
+    "brelu", "glu", "GLU", "LinearGLUBlock", "ConstantPad2d", "ConvGLUBlock"
+]
 
 
 def brelu(x, t_min=0.0, t_max=24.0, name=None):
@@ -73,6 +75,8 @@ def glu(x, dim=-1):
 
 # TODO(Hui Zhang): remove this activation
 if not hasattr(nn.functional, 'glu'):
+    logger.warn(
+        "register user glu to paddle.nn.functional, remove this when fixed!")
     setattr(nn.functional, 'glu', glu)
 
 
diff --git a/deepspeech/modules/conformer_convolution.py b/deepspeech/modules/conformer_convolution.py
index 5416bd898..4c3eb9f4f 100644
--- a/deepspeech/modules/conformer_convolution.py
+++ b/deepspeech/modules/conformer_convolution.py
@@ -58,7 +58,8 @@ class ConvolutionModule(nn.Layer):
             kernel_size=1,
             stride=1,
             padding=0,
-            bias=None if bias else False,  # None for True as default
+            bias_attr=None
+            if bias else False,  # None for True, using bias as default config
         )
 
         # self.lorder is used to distinguish if it's a causal convolution,
@@ -82,7 +83,8 @@ class ConvolutionModule(nn.Layer):
             stride=1,
             padding=padding,
             groups=channels,
-            bias=None if bias else False,  # None for True as default
+            bias_attr=None
+            if bias else False,  # None for True, using bias as default config
         )
 
         assert norm in ['batch_norm', 'layer_norm']
@@ -99,7 +101,8 @@ class ConvolutionModule(nn.Layer):
             kernel_size=1,
             stride=1,
             padding=0,
-            bias=None if bias else False,  # None for True as default
+            bias_attr=None
+            if bias else False,  # None for True, using bias as default config
         )
         self.activation = activation
 
@@ -109,10 +112,10 @@ class ConvolutionModule(nn.Layer):
         Args:
            x (paddle.Tensor): Input tensor (#batch, time, channels).
            cache (paddle.Tensor): left context cache, it is only
-                used in causal convolution. (#batch, channels, time)
+                used in causal convolution. (#batch, channels, time')
         Returns:
            paddle.Tensor: Output tensor (#batch, time, channels).
-            paddle.Tensor: Output cache tensor (#batch, channels, time)
+            paddle.Tensor: Output cache tensor (#batch, channels, time')
         """
         # exchange the temporal dimension and the feature dimension
         x = x.transpose([0, 2, 1])  # [B, C, T]