diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml
index 010d8f155..2f63f4de0 100644
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
   fc_layers_size_list: -1,
   use_gru: False
   blank_id: 0
-  ctc_grad_norm_type: null
+

 training:
   n_epoch: 65
diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml
index 70fa3fcb2..f3574e150 100644
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -41,7 +41,7 @@ model:
   use_gru: False
   share_rnn_weights: True
   blank_id: 0
-  ctc_grad_norm_type: null
+

 training:
   n_epoch: 50
diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
index 3e07862d6..0d16bc571 100644
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -43,7 +43,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: False
   blank_id: 0
-  ctc_grad_norm_type: null
+

 training:
   n_epoch: 50
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index 545806640..7f5930378 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -76,8 +76,6 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index 70a9dc6af..366d6de0f 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -69,8 +69,6 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index ca934eb1d..f02f24dc6 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -72,8 +72,6 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index 0cc0dae63..a90efe482 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -29,8 +29,6 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

@@ -81,7 +79,7 @@ training:
   optim_conf:
     lr: 0.004
     weight_decay: 1e-06
-  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler: warmuplr
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0
diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index 00240743e..a16563a59 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -30,8 +30,6 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
index 6ed75be4e..36f287b10 100644
--- a/examples/ted_en_zh/st0/conf/transformer.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer.yaml
@@ -68,8 +68,6 @@ model:
     model_conf:
         asr_weight: 0.0
         ctc_weight: 0.0
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
index 7e886cca3..78887d3cd 100644
--- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
@@ -68,8 +68,6 @@ model:
     model_conf:
         asr_weight: 0.5
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml
index 3bef7bc5f..609c58240 100644
--- a/examples/ted_en_zh/st1/conf/transformer.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer.yaml
@@ -68,8 +68,6 @@ model:
     model_conf:
         asr_weight: 0.0
         ctc_weight: 0.0
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
index 3175aad9f..10eccd1eb 100644
--- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -68,8 +68,6 @@ model:
     model_conf:
         asr_weight: 0.5
         ctc_weight: 0.3
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml
index af05a6cea..f518cc5e7 100644
--- a/examples/timit/asr1/conf/transformer.yaml
+++ b/examples/timit/asr1/conf/transformer.yaml
@@ -66,8 +66,6 @@ model:
     # hybrid CTC/attention
     model_conf:
         ctc_weight: 0.5
-        ctc_dropoutrate: 0.0
-        ctc_grad_norm_type: null
         lsm_weight: 0.1     # label smoothing option
         length_normalized_loss: false

diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml
index ba453aad7..7d841d474 100644
--- a/examples/tiny/asr0/conf/deepspeech2.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2.yaml
@@ -42,7 +42,7 @@ model:
   use_gru: False
   share_rnn_weights: True
   blank_id: 0
-  ctc_grad_norm_type: null
+

 training:
   n_epoch: 5
diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml
index 36c774e37..393b6439f 100644
--- a/examples/tiny/asr0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml
@@ -44,7 +44,7 @@ model:
   fc_layers_size_list: 512, 256
   use_gru: True
   blank_id: 0
-  ctc_grad_norm_type: null
+

 training:
   n_epoch: 5
diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml
index 76b97adf8..ad27478de 100644
--- a/examples/tiny/asr1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@@ -76,8 +76,6 @@ model:
     # hybrid CTC/attention
     model_conf:
        ctc_weight: 0.3
-       ctc_dropoutrate: 0.0
-       ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml
index 5f1991f95..298518fb5 100644
--- a/examples/tiny/asr1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@@ -69,8 +69,6 @@ model:
     # hybrid CTC/attention
     model_conf:
        ctc_weight: 0.3
-       ctc_dropoutrate: 0.0
-       ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false

diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml
index b2937c1bd..eb8509024 100644
--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@@ -72,8 +72,6 @@ model:
     # hybrid CTC/attention
     model_conf:
        ctc_weight: 0.3
-       ctc_dropoutrate: 0.0
-       ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false

diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml
index f53197561..c641d1f5b 100644
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@@ -66,8 +66,6 @@ model:
     # hybrid CTC/attention
     model_conf:
        ctc_weight: 0.3
-       ctc_dropoutrate: 0.0
-       ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false

diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml
index fc040a795..a438236d8 100644
--- a/examples/wenetspeech/asr1/conf/conformer.yaml
+++ b/examples/wenetspeech/asr1/conf/conformer.yaml
@@ -33,8 +33,6 @@ model:
     # hybrid CTC/attention
     model_conf:
        ctc_weight: 0.3
-       ctc_dropoutrate: 0.0
-       ctc_grad_norm_type: null
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
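Note: the two keys removed from every model_conf above can now simply be left out of a config file, because the Python side (shown in the diffs below) reads them with dict-style .get() lookups that fall back to defaults: 0.0 for the dropout rate and None for the gradient-norm type. A minimal sketch of that fallback, using a plain dict in place of a parsed model_conf (the key names come from this patch; the surrounding values are illustrative):

    # model_conf as parsed from one of the trimmed YAML files above;
    # the two CTC keys are absent.
    model_conf = {
        "ctc_weight": 0.3,
        "lsm_weight": 0.1,
        "length_normalized_loss": False,
    }

    # dict.get returns the second argument when the key is missing.
    dropout_rate = model_conf.get("ctc_dropout_rate", 0.0)       # -> 0.0
    grad_norm_type = model_conf.get("ctc_grad_norm_type", None)  # -> None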
diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
index 317abc69e..f0a553ec8 100644
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -129,7 +129,7 @@ class DeepSpeech2Model(nn.Layer):
                 rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                 share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-                ctc_grad_norm_type='instance', ))
+                ctc_grad_norm_type=None, ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -143,7 +143,7 @@ class DeepSpeech2Model(nn.Layer):
                  use_gru=False,
                  share_rnn_weights=True,
                  blank_id=0,
-                 ctc_grad_norm_type='instance'):
+                 ctc_grad_norm_type=None):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -220,16 +220,14 @@ class DeepSpeech2Model(nn.Layer):
         """
         model = cls(
             feat_size=dataloader.collate_fn.feature_size,
-            #feat_size=dataloader.dataset.feature_size,
             dict_size=dataloader.collate_fn.vocab_size,
-            #dict_size=dataloader.dataset.vocab_size,
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights,
             blank_id=config.model.blank_id,
-            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -257,7 +255,7 @@ class DeepSpeech2Model(nn.Layer):
             use_gru=config.use_gru,
             share_rnn_weights=config.share_rnn_weights,
             blank_id=config.blank_id,
-            ctc_grad_norm_type=config.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         return model
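Note: the from_pretrained/from_config hunks above work because a yacs CfgNode is a dict subclass, so config.get('ctc_grad_norm_type', None) succeeds even when the key was never defined, whereas attribute access on a missing key raises AttributeError (presumably why the old config.model.ctc_grad_norm_type lookups failed on configs without the key). A small sketch of the difference, assuming the yacs package; the config contents are illustrative:

    from yacs.config import CfgNode

    config = CfgNode({"use_gru": False, "blank_id": 0})

    # Safe: CfgNode inherits dict.get, so a missing key yields the default.
    print(config.get("ctc_grad_norm_type", None))  # -> None

    # Unsafe: attribute access on a missing key raises AttributeError.
    try:
        config.ctc_grad_norm_type
    except AttributeError as err:
        print("raises:", err)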
diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
index d134239f2..85876bce8 100644
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
@@ -255,7 +255,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 fc_layers_size_list=[512, 256],
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                 blank_id=0,  # index of blank in vocob.txt
-                ctc_grad_norm_type='instance', ))
+                ctc_grad_norm_type=None, ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -272,7 +272,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                  fc_layers_size_list=[512, 256],
                  use_gru=False,
                  blank_id=0,
-                 ctc_grad_norm_type='instance', ):
+                 ctc_grad_norm_type=None, ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -361,7 +361,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             fc_layers_size_list=config.model.fc_layers_size_list,
             use_gru=config.model.use_gru,
             blank_id=config.model.blank_id,
-            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -391,7 +391,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
             blank_id=config.blank_id,
-            ctc_grad_norm_type=config.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
         return model
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 4f833372a..8053ed3a8 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -894,14 +894,16 @@ class U2Model(U2DecodeModel):

         # ctc decoder and ctc loss
         model_conf = configs['model_conf']
+        dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
+        grad_norm_type = model_conf.get('ctc_grad_norm_type', None)
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=model_conf['ctc_dropoutrate'],
+            dropout_rate=dropout_rate,
             reduction=True,  # sum
             batch_average=True,  # sum / batch_size
-            grad_norm_type=model_conf['ctc_grad_norm_type'])
+            grad_norm_type=grad_norm_type)

         return vocab_size, encoder, decoder, ctc
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index a83e67078..3a23804fe 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -655,14 +655,16 @@ class U2STModel(U2STBaseModel):
                 **configs['decoder_conf'])
             # ctc decoder and ctc loss
             model_conf = configs['model_conf']
+            dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
+            grad_norm_type = model_conf.get('ctc_grad_norm_type', None)
             ctc = CTCDecoder(
                 odim=vocab_size,
                 enc_n_units=encoder.output_size(),
                 blank_id=0,
-                dropout_rate=model_conf['ctc_dropoutrate'],
+                dropout_rate=dropout_rate,
                 reduction=True,  # sum
                 batch_average=True,  # sum / batch_size
-                grad_norm_type=model_conf['ctc_grad_norm_type'])
+                grad_norm_type=grad_norm_type)

             return vocab_size, encoder, (st_decoder, decoder, ctc)
         else:
diff --git a/tests/unit/asr/u2_model_test.py b/tests/unit/asr/u2_model_test.py
index f46c6d403..5b11d2ad3 100644
--- a/tests/unit/asr/u2_model_test.py
+++ b/tests/unit/asr/u2_model_test.py
@@ -74,8 +74,6 @@ class TestU2Model(unittest.TestCase):
             model_conf:
                 ctc_weight: 0.3
                 lsm_weight: 0.1     # label smoothing option
-                ctc_dropoutrate: 0.0
-                ctc_grad_norm_type: null
                 length_normalized_loss: false
         """
         cfg = CN().load_cfg(conf_str)
@@ -128,8 +126,6 @@ class TestU2Model(unittest.TestCase):
             model_conf:
                 ctc_weight: 0.3
                 lsm_weight: 0.1     # label smoothing option
-                ctc_dropoutrate: 0.0
-                ctc_grad_norm_type: null
                 length_normalized_loss: false
         """
         cfg = CN().load_cfg(conf_str)
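Note: the U2/U2ST hunks also change the spelling of the dropout key, from ctc_dropoutrate (the form deleted from the configs and tests above) to ctc_dropout_rate. Every config in this patch set the value to 0.0, which is also the new default, so behaviour is unchanged here; but a user config that still sets the old spelling is now silently ignored. A short sketch of that edge case (the old-style config below is hypothetical):

    # Hypothetical leftover config still using the old key spelling.
    old_style_conf = {"ctc_weight": 0.3, "ctc_dropoutrate": 0.2}

    # The new lookup uses the new spelling, so the old key has no effect
    # and the default wins.
    dropout_rate = old_style_conf.get("ctc_dropout_rate", 0.0)
    print(dropout_rate)  # -> 0.0, not 0.2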