diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index da3b1acb..ed209f3d 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -356,35 +356,7 @@ if not hasattr(paddle.Tensor, 'tolist'):
     setattr(paddle.Tensor, 'tolist', tolist)
 
 
-########### hcak paddle.nn.functional #############
-# hack loss
-def ctc_loss(logits,
-             labels,
-             input_lengths,
-             label_lengths,
-             blank=0,
-             reduction='mean',
-             norm_by_times=True):
-    #logger.info("my ctc loss with norm by times")
-    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
-    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
-                                           input_lengths, label_lengths)
-
-    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
-    assert reduction in ['mean', 'sum', 'none']
-    if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / label_lengths)
-    elif reduction == 'sum':
-        loss_out = paddle.sum(loss_out)
-    return loss_out
-
-
-logger.debug(
-    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
-)
-F.ctc_loss = ctc_loss
-
-########### hcak paddle.nn #############
+########### hack paddle.nn #############
 from paddle.nn import Layer
 from typing import Optional
 from typing import Mapping
@@ -534,3 +506,5 @@ if not hasattr(paddle.nn, 'LayerDict'):
     logger.debug(
         "register user LayerDict to paddle.nn, remove this when fixed!")
     setattr(paddle.nn, 'LayerDict', LayerDict)
+
+
diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py
index 565a11e1..e0c8006d 100644
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import paddle
 from paddle import nn
+from typing import Union
 from paddle.nn import functional as F
 from typeguard import check_argument_types
 
@@ -40,7 +41,7 @@ class CTCDecoderBase(nn.Layer):
                  dropout_rate: float=0.0,
                  reduction: bool=True,
                  batch_average: bool=True,
-                 grad_norm_type: str="instance"):
+                 grad_norm_type: Union[str, None]=None):
         """CTC decoder
 
         Args:
@@ -49,7 +50,7 @@ class CTCDecoderBase(nn.Layer):
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
             reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
             batch_average (bool): do batch dim wise average.
-            grad_norm_type (str): one of 'instance', 'batch', 'frame', None.
+            grad_norm_type (str, optional): one of 'instance', 'batch', 'frame' or None. Defaults to None.
         """
         assert check_argument_types()
         super().__init__()
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index e06f26f8..e1138810 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -54,7 +54,7 @@ class CTCLoss(nn.Layer):
             self.norm_by_total_logits_len = True
         else:
             raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}")
-        self.kwargs = {
+        kwargs = {
             "norm_by_times": self.norm_by_times,
             "norm_by_batchsize": self.norm_by_batchsize,
             "norm_by_total_logits_len": self.norm_by_total_logits_len,
@@ -66,10 +66,9 @@ class CTCLoss(nn.Layer):
         except ValueError:
             # Some function, e.g. built-in function, are failed
             param = {}
-        self._kwargs = {k: v for k, v in self.kwargs.items() if k in param}
-        _notin = {k: v for k, v in self.kwargs.items() if k not in param}
+        self._kwargs = {k: v for k, v in kwargs.items() if k in param}
+        _notin = {k: v for k, v in kwargs.items() if k not in param}
         logger.info(f"{self.loss} kwargs:{self._kwargs}, not support: {_notin}")
-        #self.loss_fn = partial(self.loss.forward, **_kwargs)
 
     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
@@ -89,8 +88,7 @@ class CTCLoss(nn.Layer):
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         ys_pad = ys_pad.astype(paddle.int32)
-        #loss = self.loss_fn(logits, ys_pad, hlens, ys_lens)
-        loss = self.loss(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(logits, ys_pad, hlens, ys_lens, **self._kwargs)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index ffefaeb3..7a198991 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -41,7 +41,7 @@ model:
   use_gru: True 
   share_rnn_weights: False
   blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 
 training:
   n_epoch: 80
diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml
index cac599dc..c15e71a3 100644
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
   fc_layers_size_list: -1,
   use_gru: False 
   blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
   
 training:
   n_epoch: 50
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index 9b563da2..8682538b 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -77,7 +77,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index dfa9a4b0..71cd044e 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -72,7 +72,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index 47ef9421..8afaabf4 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -41,7 +41,7 @@ model:
   use_gru: False 
   share_rnn_weights: True
   blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 
 training:
   n_epoch: 50
diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml
index e2f91094..d6ab9523 100644
--- a/examples/librispeech/s0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: False 
   blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 
 training:
   n_epoch: 50
diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml
index 9936450b..4d0e6ceb 100644
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@@ -77,7 +77,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml
index 44f3e5a7..c7b53f95 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -70,7 +70,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml
index a05e37dd..3bc942dc 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -73,7 +73,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index c9dc1413..3cc17004 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -68,7 +68,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null 
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml
index 872b560b..afd2b051 100644
--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
@@ -77,7 +77,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml
index 132a4f9d..721bb7d9 100644
--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
@@ -70,7 +70,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml
index bc87466e..ef87753c 100644
--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
@@ -73,7 +73,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml
index 8c03e328..8a7e10f0 100644
--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
@@ -69,7 +69,7 @@ model:
         asr_weight: 0.0
         ctc_weight: 0.0
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
index cbfae93e..f8dc383f 100644
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
@@ -69,7 +69,7 @@ model:
         asr_weight: 0.5
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml
index e138fbbe..d3ced898 100644
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
@@ -67,7 +67,7 @@ model:
     model_conf:
         ctc_weight: 0.5
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index a7940cb2..621b372c 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -42,7 +42,7 @@ model:
   use_gru: False 
   share_rnn_weights: True 
   blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 
 training:
   n_epoch: 10
diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml
index 7e30409f..5a8294ad 100644
--- a/examples/tiny/s0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/s0/conf/deepspeech2_online.yaml
@@ -44,7 +44,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: True 
   blank_id: 0
-  ctc_grad_norm_type: instance
+  ctc_grad_norm_type: null
 
 training:
   n_epoch: 10
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
index f3c7e1dd..b14b4b21 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -77,7 +77,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
index 83005754..38edbf35 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -70,7 +70,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
index 628e3b77..0b06b2b7 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -73,7 +73,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false
 
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index 27ffcae4..1c6f9e02 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -67,7 +67,7 @@ model:
     model_conf:
         ctc_weight: 0.3
         ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: instance
+        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false