diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index bdfa4219..1dc8581e 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -1,68 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 0 - ctc_grad_norm_type: instance +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 0 +ctc_grad_norm_type: instance -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 1.9 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 2f63f4de..c49973a2 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: 
data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear #linear, mfcc, fbank - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear #linear, mfcc, fbank +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 5 - rnn_layer_size: 1024 - rnn_direction: forward # [forward, bidirect] - num_fc_layers: 0 - fc_layers_size_list: -1, - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 5 +rnn_layer_size: 1024 +rnn_direction: forward # [forward, bidirect] +num_fc_layers: 0 +fc_layers_size_list: -1, +use_gru: False +blank_id: 0 -training: - n_epoch: 65 - accum_grad: 1 - lr: 5e-4 - lr_decay: 0.93 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 65 +accum_grad: 1 +lr: 5e-4 +lr_decay: 0.93 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.2 #1.9 - beta: 4.3 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..9de06711 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +chunk_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.2 #1.9 +beta: 4.3 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml new 
file mode 100644 index 00000000..5778e656 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 1.9 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh index 4f5e5c8b..7a4b87f8 100755 --- a/examples/aishell/asr0/local/test_export.sh +++ b/examples/aishell/asr0/local/test_export.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -jit_model_export_path=$2 -model_type=$3 +decode_config_path=$2 +jit_model_export_path=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh > /dev/null 2>&1 @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test_export.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${jit_model_export_path}.rsl \ --export_path ${jit_model_export_path} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_wav.sh b/examples/aishell/asr0/local/test_wav.sh index 4a6d92fb..62b005a6 100755 --- a/examples/aishell/asr0/local/test_wav.sh +++ b/examples/aishell/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 270b88fc..15685f21 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline # offline or online audio_file=data/demo_01_03.wav @@ -34,7 +35,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -44,11 +45,11 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 50eaef98..68e852ba 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -1,103 +1,95 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # 
the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +########################################### +# Data # +########################################### - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. 
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 9de28c12..775a4527 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -1,97 +1,89 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt 
+spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 8 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 7c5fa624..9d294653 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -1,95 +1,85 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - - -collator: - unit_type: 'char' - vocab_filepath: data/lang_char/vocab.txt - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam 
- optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Dataloader # +########################################### +unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..72ede927 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/conf/tuning/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..72ede927 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. 
Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index da159de7..65b884e5 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh index f85c1a47..d029f2fd 100755 --- a/examples/aishell/asr1/local/test_wav.sh +++ b/examples/aishell/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -42,10 +43,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index d07a4ed5..c54dae9c 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 audio_file=data/demo_01_03.wav @@ -32,18 +33,18 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi # Not supported at now!!! 
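Note: with the decode options split out of the training YAML into conf/tuning/decode.yaml, the test, align, and test_wav scripts above now take that decode config as an extra argument, and run.sh passes it through. A minimal sketch of invoking the updated aishell/asr1 test stage by hand; the checkpoint directory and the averaged-checkpoint name avg_20 are illustrative, not taken from this diff:

    # equivalent to stage 3 of examples/aishell/asr1/run.sh, run manually
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh \
        conf/conformer.yaml \
        conf/tuning/decode.yaml \
        exp/conformer/checkpoints/avg_20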
diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml index 69959c68..19e783a6 100644 --- a/examples/callcenter/asr1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -1,120 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +batch_size: 32 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 8000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +############################################ +# Network Architecture # 
+############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml index 80c15abb..f6fcb949 100644 --- a/examples/callcenter/asr1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -1,117 +1,92 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.0 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: 
False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 100 # 50 will be lowest - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. 
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 100 # 50 will be lowest +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml index f7f4c58d..877e7d5a 100644 --- a/examples/callcenter/asr1/conf/preprocess.yaml +++ b/examples/callcenter/asr1/conf/preprocess.yaml @@ -1,7 +1,7 @@ process: # extract kaldi fbank from PCM - type: fbank_kaldi - fs: 16000 + fs: 8000 n_mels: 80 n_shift: 160 win_length: 400 diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..49a6a114 --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d2e0b72d --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/decode.yaml @@ -0,0 +1,13 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index 681c77ed..1397ae57 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 ckpt_name=$(basename ${ckpt_prefxi}) @@ -25,9 +26,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh index fc43c5a2..b7ff722a 100755 --- a/examples/callcenter/asr1/local/test.sh +++ b/examples/callcenter/asr1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 + ckpt_name=$(basename ${ckpt_prefxi}) @@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
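Note: the --opts override namespace also changes from decoding.* to decode.*, and the batch-size key becomes decode_batch_size, so any hand-rolled call to the test entry point has to follow the same renaming. A sketch under those assumptions, mirroring what local/test.sh does; BIN_DIR and the checkpoint path are illustrative:

    # attention decoding with an explicit batch-size override
    python3 -u ${BIN_DIR}/test.py \
        --ngpu 1 \
        --config conf/conformer.yaml \
        --decode_cfg conf/tuning/decode.yaml \
        --result_file exp/conformer/checkpoints/avg_20.attention.rsl \
        --checkpoint_path exp/conformer/checkpoints/avg_20 \
        --opts decode.decoding_method attention \
        --opts decode.decode_batch_size 64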
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh index e9be3d03..0c7ffc1e 100644 --- a/examples/callcenter/asr1/run.sh +++ b/examples/callcenter/asr1/run.sh @@ -4,8 +4,9 @@ source path.sh gpus=0,1,2,3 stage=0 -stop_stage=100 +stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,15 +32,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index f3574e15..0b0a1550 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -1,68 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 20 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 20 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 
+############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 1 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +lr: 1e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index 0d16bc57..8bd5a672 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 15 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 15 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: False +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 
4 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 4 +lr: 1e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..e07026ba --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..e07026ba --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/librispeech/asr0/local/test_wav.sh b/examples/librispeech/asr0/local/test_wav.sh index e8337da7..25cfc45e 100755 --- a/examples/librispeech/asr0/local/test_wav.sh +++ b/examples/librispeech/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 5d811b65..ca2c2b9d 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 model_type=offline audio_file=data/demo_002_en.wav @@ -33,7 +34,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -43,5 +44,5 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml index 662d559c..72b9cb7b 100644 --- a/examples/librispeech/asr1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -1,103 +1,99 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' 
# using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 8 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 120 - accum_grad: 8 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 
0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml index bc77ba41..19ade8ad 100644 --- a/examples/librispeech/asr1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -1,103 +1,89 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - 
minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 \ No newline at end of file diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index 5a570897..4f7b759b 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -1,104 +1,96 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' 
epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 70 - accum_grad: 8 - global_grad_clip: 3.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 70 +accum_grad: 8 +global_grad_clip: 3.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml index b7f33e22..740ce78f 100644 --- a/examples/librispeech/asr1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -1,110 +1,88 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 32 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is 
automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 120 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..0760e721 --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. 
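The new conf/tuning/*.yaml files above carry only decode-time options. At run time they are merged into the (now flattened) main config as a `decode` sub-node, which is why the test scripts below take a separate decode_config_path argument and address overrides as decode.*. A minimal sketch of that loading pattern, mirroring the CfgNode code in the src_deepspeech2x/bin/test.py hunk near the end of this patch; the file paths here are illustrative, not taken from the patch:

from yacs.config import CfgNode

# Load the flattened main config, then attach the decode-only config
# as a `decode` sub-node (file paths here are illustrative).
config = CfgNode(new_allowed=True)
config.merge_from_file("conf/transformer.yaml")

decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file("conf/tuning/decode.yaml")
config.decode = decode_confs
config.freeze()

# Decode-time options are then read from the sub-node:
print(config.decode.decoding_method)  # e.g. attention
print(config.decode.beam_size)        # e.g. 10
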
diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index aa06132e..51ced18b 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -15,8 +15,8 @@ recog_set="test-clean" stage=0 stop_stage=100 -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -76,10 +78,11 @@ for type in ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh index ab6d685d..e70fc83c 100755 --- a/examples/librispeech/asr1/local/test_wav.sh +++ b/examples/librispeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -49,10 +50,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index f839e5af..116dae12 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -8,6 +8,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 audio_file=data/demo_002_en.wav @@ -34,17 +35,17 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml new file mode 100644 index 00000000..384ed197 --- /dev/null +++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 1 +error_rate_type: wer +decoding_method: attention # 
'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml index a16563a5..32d95b41 100644 --- a/examples/librispeech/asr2/conf/transformer.yaml +++ b/examples/librispeech/asr2/conf/transformer.yaml @@ -1,73 +1,80 @@ # https://yaml.org/type/float.html -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: 
auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/train_960_unigram5000_units.txt +unit_type: spm +spm_model_prefix: data/lang_char/train_960_unigram5000 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 30 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 2 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +log_interval: 1 +checkpoint: + kbest_n: 50 + latest_n: 5 optim: adam optim_conf: @@ -79,23 +86,5 @@ scheduler_conf: warmup_steps: 25000 lr_decay: 1.0 -decoding: - batch_size: 1 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index 626c3574..60a16f42 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -dict_path=$2 -ckpt_prefix=$3 +decode_config_path=$2 +dict_path=$3 +ckpt_prefix=$4 batch_size=1 output_dir=${ckpt_prefix} @@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \ --dict-path ${dict_path} \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" 
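The --opts renames in the align.sh hunk above and the test.sh hunk that follows are a direct consequence of the flattening: data/dataloader keys such as test_manifest are now top level (no data. prefix), and decode options live under the decode sub-node as decode.decoding_method, decode.decode_batch_size, and so on. A small illustration of the new override keys, assuming yacs is available; every value below is made up:

from yacs.config import CfgNode

# Toy config mirroring the flattened layout (all values made up).
config = CfgNode(new_allowed=True)
config.test_manifest = "data/manifest.test-clean"
config.decode = CfgNode({"decoding_method": "attention",
                         "decode_batch_size": 1})

# Old keys:  --opts data.test_manifest ...   decoding.batch_size ...
# New keys, as passed by these scripts:
config.merge_from_list(["test_manifest", "data/manifest.dev-clean",
                        "decode.decode_batch_size", "16"])
print(config.test_manifest)             # data/manifest.dev-clean
print(config.decode.decode_batch_size)  # 16
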
diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index d210f2a8..8cf3b52c 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -19,6 +19,7 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe} bpemodel=${bpeprefix}.model config_path=conf/transformer.yaml +decode_config_path=conf/decode/decode_base.yaml dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt ckpt_prefix= @@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --checkpoint_path ${ckpt_prefix} \ --result-file ${decode_dir}/data.JOB.json \ - --opts decoding.decoding_method ${dmethd} \ - --opts decoding.batch_size ${batch_size} \ - --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} + --opts decode.decoding_method ${dmethd} \ + --opts decode.decode_batch_size ${batch_size} \ + --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh index 5b7596f2..c9a794e3 100755 --- a/examples/librispeech/asr2/run.sh +++ b/examples/librispeech/asr2/run.sh @@ -9,7 +9,8 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=50 conf_path=conf/transformer.yaml -dict_path=lang_char/train_960_unigram5000_units.txt +decode_conf_path=conf/decode/decode_base.yaml +dict_path=data/lang_char/train_960_unigram5000_units.txt avg_num=10 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -35,7 +36,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # attetion resocre decoder - ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -45,7 +46,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml index c2d69226..c2db2c7c 100644 --- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml @@ -1,67 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - 
augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 4333 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 4333 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.6 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml new file mode 100644 index 00000000..b5283a93 --- /dev/null +++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.6 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh index 1ccac1c3..89a63411 100755 --- a/examples/other/1xt2x/aishell/run.sh +++ b/examples/other/1xt2x/aishell/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=2 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml index be51a9b9..0c08fbc6 100644 --- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: .inf # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: .inf # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False 
+blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.4 - beta: 0.35 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml new file mode 100644 index 00000000..f52dde32 --- /dev/null +++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.4 +beta: 0.35 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh index b7f69f6b..82de56b0 100755 --- a/examples/other/1xt2x/baidu_en8k/run.sh +++ b/examples/other/1xt2x/baidu_en8k/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=0 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml index ad7fb2c1..a2a5649b 100644 --- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 1000.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 1000.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False 
+share_rnn_weights: True +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml new file mode 100644 index 00000000..f3b51def --- /dev/null +++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh index 8c667de2..8b614bbb 100755 --- a/examples/other/1xt2x/librispeech/run.sh +++ b/examples/other/1xt2x/librispeech/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=1 @@ -23,5 +24,5 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b4f9cdf9..88a13fdc 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -13,8 +13,8 @@ # limitations under the License. 
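The "# https://yaml.org/type/float.html" link that heads each of these configs is worth keeping in mind when editing them: a YAML 1.1 loader only resolves scientific notation as a float when the literal contains a dot and a signed exponent, so a value such as 1e-06 silently loads as a string. A minimal check, assuming PyYAML-style resolution (which yacs relies on when reading these files):

# Demonstrates the YAML 1.1 float gotcha referenced at the top of the configs:
# scientific notation needs a dot and a signed exponent to be parsed as float.
import yaml

print(type(yaml.safe_load("lr: 2e-3")["lr"]))       # str   (no dot)
print(type(yaml.safe_load("wd: 1e-06")["wd"]))      # str   (no dot)
print(type(yaml.safe_load("wd: 1.0e-06")["wd"]))    # float (dot + signed exponent)
print(type(yaml.safe_load("x: .inf")["x"]))         # float (infinity literal)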
"""Evaluation for DeepSpeech2 model.""" from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +41,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index ad83a41d..003b02e2 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -120,20 +120,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -233,11 +219,11 @@ class DeepSpeech2Model(nn.Layer): """ model = cls(feat_size=dataloader.collate_fn.feature_size, dict_size=len(dataloader.collate_fn.vocab_list), - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -250,7 +236,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 82e190d8..246fb107 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -44,27 +44,11 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, 
batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -98,7 +82,7 @@ class DeepSpeech2Trainer(Trainer): iteration_time = time.time() - start msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -126,7 +110,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -146,15 +130,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - #config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) + config.feat_size = self.train_loader.collate_fn.feature_size + #config.dict_size = self.train_loader.collate_fn.vocab_size + config.dict_size = len(self.train_loader.collate_fn.vocab_list) config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -163,17 +147,13 @@ class DeepSpeech2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.model = model @@ -184,59 +164,59 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler 
= SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" collate_fn_test = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test) @@ -250,31 +230,10 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.unit_type, vocab=None) super().__init__(config, args) def ordid2token(self, texts, texts_len): @@ -293,7 +252,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer @@ -399,31 +358,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.export() except KeyboardInterrupt: exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. 
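The flattened keys are consumed directly in setup_model, as the hunk a little earlier in test_model.py shows. A self-contained sketch of the same optimizer wiring, substituting paddle.nn.ClipGradByGlobalNorm for the repo's ClipGradByGlobalNormWithLog wrapper and a dummy linear layer for the real model:

# Optimizer/scheduler setup driven by the flattened config keys
# (lr, lr_decay, weight_decay, global_grad_clip).
import paddle
from yacs.config import CfgNode

config = CfgNode(dict(lr=2e-3, lr_decay=0.83, weight_decay=1e-6, global_grad_clip=3.0))
model = paddle.nn.Linear(161, 29)  # placeholder for the real acoustic model

grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
    learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
optimizer = paddle.optimizer.Adam(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
    grad_clip=grad_clip)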
- """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml new file mode 100644 index 00000000..d3992cb9 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 8afb107b..d113fc94 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -1,114 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False 
+sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index 017230fe..a01ec1a6 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -1,114 +1,102 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of 
attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml new file mode 100644 index 00000000..ed081cf4 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. 
+ # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index 0796a06e..904f95c4 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,16 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh index b85ba95a..1746c025 100755 --- a/examples/ted_en_zh/st0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml b/examples/ted_en_zh/st1/conf/preprocess.yaml new file mode 100644 index 00000000..bc86d98c --- /dev/null +++ b/examples/ted_en_zh/st1/conf/preprocess.yaml @@ -0,0 +1,16 @@ +process: + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index a8918a23..515edee2 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -1,104 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 20 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can 
chose conv2d, conv2d6 and conv2d8 - normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -training: - n_epoch: 40 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 0. - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. 
+scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index 3787037f..a5f956fa 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -1,104 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 20 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +# decoder related 
+decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -training: - n_epoch: 40 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 0. - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d6104dbc --- /dev/null +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -0,0 +1,12 @@ + +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index 0796a06e..904f95c4 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,16 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
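A note on the Training sections above: with optim: adam, scheduler: noam and warmup_steps: 25000, the lr: 2.5 value is a scale factor rather than a literal learning rate. Assuming the scheduler follows the standard Transformer (Noam) schedule, the effective peak learning rate with output_size 256 is only about 1e-3:

# Hedged sketch of the Noam schedule; the exact scheduler implementation in the
# repo may differ, but the scaling behaviour of lr: 2.5 is the point here.
def noam_lr(step, lr_scale=2.5, d_model=256, warmup=25000):
    step = max(step, 1)
    return lr_scale * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

print(noam_lr(1))        # tiny at the start of warmup
print(noam_lr(25000))    # ~9.9e-4 peak at the end of warmup
print(noam_lr(100000))   # then decays as step ** -0.5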
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f6362a8b..1808e37b 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data @@ -27,7 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ -n "${ckpt_path}" ]; then echo "Finetune from Pretrained Model" ${ckpt_path} ./local/download_pretrain.sh || exit -1 - fi + fi CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi @@ -38,5 +39,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 -fi \ No newline at end of file + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1c6059e4..4731395f 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 # second - max_input_len: 10.0 # second - min_output_len: 0.0 # tokens - max_output_len: 150.0 # tokens - min_output_input_ratio: 0.005 - max_output_input_ratio: 1000.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: "word" - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: "word" +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 
+batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 128 # dimension of attention - attention_heads: 4 - linear_units: 1024 # the number of units of position-wise feed forward - num_blocks: 6 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 128 # dimension of attention + attention_heads: 4 + linear_units: 1024 # the number of units of position-wise feed forward + num_blocks: 6 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.5 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.5 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 50 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 1200 - lr_decay: 1.0 - log_interval: 10 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1200 + lr_decay: 1.0 +log_interval: 10 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/timit/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 08ee0e36..88192c58 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -7,8 +7,8 @@ stop_stage=50 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
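The --opts overrides in the updated local/test.sh (decode.decoding_method and decode.decode_batch_size) are applied through config.merge_from_list on the Python side. A small sketch of how yacs handles such a flat KEY VALUE list, using placeholder defaults:

# yacs merge_from_list takes alternating key/value tokens and coerces each value
# to the type already present in the config (here "1" becomes the int 1).
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.decode = CfgNode(dict(decoding_method="attention", decode_batch_size=64))

config.merge_from_list([
    "decode.decoding_method", "ctc_prefix_beam_search",
    "decode.decode_batch_size", "1",
])
assert config.decode.decode_batch_size == 1
print(config.decode.decoding_method)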
@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index a95b5f3a..0d84be9f 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=10 TIMIT_path=/path/to/TIMIT @@ -34,15 +35,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 7d841d47..a16a79d3 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - 
window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 0.8 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 0.8 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 393b6439..5458cfb3 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -1,72 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt 
+augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 4 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 4 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 1.0 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 1.0 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh index f39fb3fa..25f04624 100755 --- a/examples/tiny/asr0/run.sh +++ b/examples/tiny/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index ad27478d..8f785121 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -1,120 +1,97 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: True - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" 
+cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: True + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 298518fb..2570bb85 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -1,113 +1,91 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 
and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 - +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index eb850902..eb8f0ab9 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,116 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - 
src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
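The tiny/asr1 configs above follow the same migration as the asr0 ones: the former data, collator, model and training groups are flattened into top-level keys of a single training YAML, and everything that used to sit under decoding: moves into a standalone conf/tuning/decode.yaml (chunk_decode.yaml for the streaming variants). Below is a minimal sketch of how such a pair of flat files is combined, mirroring the yacs CfgNode(new_allowed=True) pattern that the Python changes later in this patch adopt; the file paths and the override value are illustrative only.

    from yacs.config import CfgNode

    # Flat training config: the former data/collator/model/training keys now all
    # live at the top level of one YAML file.
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/conformer.yaml")       # illustrative path

    # Decoding options live in their own file and are attached under config.decode,
    # which is what the new --decode_cfg flag does in the updated entry points.
    decode = CfgNode(new_allowed=True)
    decode.merge_from_file("conf/tuning/decode.yaml")   # illustrative path
    config.decode = decode

    # Command-line overrides use the renamed flat keys,
    # e.g. --opts decode.decoding_method ctc_greedy_search
    config.merge_from_list(["decode.decode_batch_size", "8"])
    config.freeze()

    print(config.encoder, config.decode.decoding_method, config.decode.decode_batch_size)

Because new_allowed=True, no per-recipe get_cfg_defaults() schema is required; whatever keys the YAML defines become the config, which is why the old config.py default builders can be dropped further down in this patch.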
diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index c641d1f5..4e3068d1 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: data/mean_std.json - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 
+stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 2 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 2 + latest_n: 1 -decoding: - batch_size: 8 #64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..c5b641da --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..a0984f9e --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. 
+simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 190bacff..79df969b 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
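With the decode options split out, local/test.sh and local/align.sh above gain the decode config as an extra positional argument, forward it as --decode_cfg, and pass per-run overrides with the renamed keys (decode.decoding_method, decode.decode_batch_size) rather than the old decoding.* names; the run.sh scripts define decode_conf_path=conf/tuning/decode.yaml and thread it through. Below is a small sketch for sanity-checking one of the new conf/tuning/*.yaml files before launching a recipe, assuming the same yacs loading used elsewhere in this patch; the path and the checked keys are only examples.

    from yacs.config import CfgNode

    # Load a decode config on its own, the same way the test scripts hand it to
    # --decode_cfg, and verify the keys the updated scripts rely on.
    decode = CfgNode(new_allowed=True)
    decode.merge_from_file("conf/tuning/decode.yaml")   # illustrative path

    for key in ("decode_batch_size", "error_rate_type", "decoding_method", "beam_size"):
        assert key in decode, f"decode config is missing '{key}'"

    print(decode.decoding_method, decode.decode_batch_size)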
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh index ec9c5a56..1651c034 100755 --- a/examples/tiny/asr1/run.sh +++ b/examples/tiny/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index a438236d..6c2bbca4 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,111 +1,92 @@ -# network architecture -model: - # encoder related - encoder: conformer - encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - cnn_module_norm: layer_norm - activation_type: swish - pos_enc_layer_type: rel_pos - selfattention_layer_type: rel_selfattn +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: 
data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.1 # second - max_input_len: 12.0 # second - min_output_len: 1.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 240 - accum_grad: 16 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 5000 - lr_decay: 1.0 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 16 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..6924bfa6 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index da159de7..65b884e5 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 5c779474..47464262 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -43,10 +44,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index d77f409f..9995bc63 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml - +decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -36,12 +36,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -51,5 +51,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 3e9939f0..88955eac 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -85,7 +85,7 @@ def recog_v2(args): mode="asr", load_output=False, sort_in_input_length=False, - preprocess_conf=confs.collator.augmentation_config + preprocess_conf=confs.preprocess_config if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 7ccb3a6c..ccb85906 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -20,7 +20,7 @@ from paddle.inference import Config from paddle.inference import create_predictor from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -80,13 +80,13 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - 
config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -105,14 +105,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -176,15 +176,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index 5c6eee3f..85c2466f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -18,7 +18,7 @@ import numpy as np import paddle from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -62,14 +62,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - 
beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -111,15 +111,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 66042e84..090b5fab 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,7 +41,7 @@ if __name__ == "__main__": print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index f52615fa..388b380d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +42,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index e073ebbf..176028ed 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -46,9 +46,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index cf2ca0d6..e2cb7e2f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( @@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub(): audio = paddle.unsqueeze(audio, axis=0) vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, cfg.decoding) + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.input_dim = self.collate_fn_test.feature_size - config.model.output_dim = self.collate_fn_test.vocab_size + 
config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") @@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub(): self.checkpoint_dir = checkpoint_dir self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) def resume(self): """Resume from the checkpoint at checkpoints in the output @@ -187,9 +188,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 400538f9..5e8c0fff 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -14,7 +14,7 @@ """Trainer for DeepSpeech2 model.""" from paddle import distributed as dist -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py deleted file mode 100644 index 58dc05ff..00000000 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline - - -def get_cfg_defaults(model_type='offline'): - _C = CfgNode() - _C.data = ManifestDataset.params() - _C.collator = SpeechCollator.params() - _C.training = DeepSpeech2Trainer.params() - _C.decoding = DeepSpeech2Tester.params() - if model_type == 'offline': - _C.model = DeepSpeech2Model.params() - else: - _C.model = DeepSpeech2ModelOnline.params() - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index a0b69d64..e7d5e20f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -49,28 +49,12 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - batch_size = self.config.collator.batch_size - accum_grad = self.config.training.accum_grad + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad start = time.time() @@ -133,7 +117,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -154,16 +138,16 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.input_dim = self.train_loader.collate_fn.feature_size - config.model.output_dim = self.train_loader.collate_fn.vocab_size + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.input_dim = self.test_loader.collate_fn.feature_size - config.model.output_dim = self.test_loader.collate_fn.vocab_size + config.input_dim = self.test_loader.collate_fn.feature_size + config.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -177,17 +161,13 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog( - 
config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.optimizer = optimizer self.lr_scheduler = lr_scheduler @@ -198,95 +178,75 @@ class DeepSpeech2Trainer(Trainer): config.defrost() if self.train: # train - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) - config.collator.keep_transcription_text = False + config.keep_transcription_text = False collate_fn_train = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) # dev - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = False + config.augmentation_config = "" + config.keep_transcription_text = False collate_fn_dev = SpeechCollator.from_config(config) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size), + batch_size=int(config.batch_size), shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup train/valid Dataloader!") else: # test - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True + config.augmentation_config = "" + config.keep_transcription_text = True collate_fn_test = SpeechCollator.from_config(config) - + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup test Dataloader!") class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. 
- beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ @@ -304,17 +264,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, decode_cfg) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -327,29 +287,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("Current error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type) + error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): result_transcripts = self.model.decode( audio, audio_len, vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decoding_method=decode_cfg.decoding_method, + lang_model_path=decode_cfg.lang_model_path, + beam_alpha=decode_cfg.alpha, + beam_beta=decode_cfg.beta, + beam_size=decode_cfg.beam_size, + cutoff_prob=decode_cfg.cutoff_prob, + cutoff_top_n=decode_cfg.cutoff_top_n, + num_processes=decode_cfg.num_proc_bsearch) return result_transcripts @@ -358,7 +320,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as 
fout: @@ -412,11 +373,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, + batch_size=self.config.decode.decode_batch_size, model_name="deepspeech2", model_precision="fp32").getlog() self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -441,7 +401,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: self.autolog.report() - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): if self.args.model_type == "online": output_probs, output_lens = self.static_forward_online(audio, audio_len) @@ -454,13 +415,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, - vocab_list, cfg.decoding_method) + self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, + decode_cfg.lang_model_path, vocab_list, + decode_cfg.decoding_method) result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) + output_probs, output_lens, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) @@ -531,12 +494,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): num_chunk = int(num_chunk) chunk_state_h_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) chunk_state_c_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) input_names = self.predictor.get_input_names() diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index df95baeb..e3390feb 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Alignment for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,16 +32,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py index 44fc7c3e..3907cebd 100644 --- a/paddlespeech/s2t/exps/u2/bin/export.py +++ b/paddlespeech/s2t/exps/u2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +31,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 48b0670d..f14d804f 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 556316ec..9904813a 100644 --- 
a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,8 +18,8 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser @@ -36,23 +36,22 @@ class U2Infer(): self.args = args self.config = config self.audio_file = args.audio_file - self.sr = config.collator.target_sample_rate - self.preprocess_conf = config.collator.augmentation_config + self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) self.text_feature = TextFeaturizer( - unit_type=config.collator.unit_type, - vocab=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix) + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): - model_conf.input_dim = config.collator.feat_dim + model_conf.input_dim = config.feat_dim model_conf.output_dim = self.text_feature.vocab_size model = U2Model.from_config(model_conf) self.model = model @@ -70,10 +69,6 @@ class U2Infer(): # read audio, sample_rate = soundfile.read( self.audio_file, dtype="int16", always_2d=True) - if sample_rate != self.sr: - logger.error( - f"sample rate error: {sample_rate}, need {self.sr} ") - sys.exit(-1) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -85,17 +80,17 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - cfg = self.config.decoding + decode_config = self.config.decode result_transcripts = self.model.decode( xs, ilen, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") @@ -133,9 +128,13 @@ if __name__ == "__main__": "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index d6ee8b30..d562278f 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -17,7 +17,7 @@ import os from paddle import distributed as dist -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode from 
paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +44,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py deleted file mode 100644 index 898b0bb2..00000000 --- a/paddlespeech/s2t/exps/u2/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2.model import U2Tester -from paddlespeech.s2t.exps.u2.model import U2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2 import U2Model - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2Model.params() - -_C.training = U2Trainer.params() - -_C.decoding = U2Tester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 6b529b40..d0cea031 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -46,38 +46,11 @@ logger = Log(__name__).getlog() class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -120,7 +93,7 @@ class U2Trainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) @@ -153,7 +126,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i 
+ 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -182,7 +155,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -214,8 +187,7 @@ class U2Trainer(Trainer): k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," - if (batch_index + 1 - ) % self.config.training.log_interval == 0: + if (batch_index + 1) % self.config.log_interval == 0: logger.info(msg) data_start_time = time.time() except Exception as e: @@ -252,29 +224,29 @@ class U2Trainer(Trainer): if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, - sortagrad=config.collator.sortagrad, - batch_size=config.collator.batch_size, - maxlen_in=config.collator.maxlen_in, - maxlen_out=config.collator.maxlen_out, - minibatches=config.collator.minibatches, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, mini_batch_size=self.args.ngpu, - batch_count=config.collator.batch_count, - batch_bins=config.collator.batch_bins, - batch_frames_in=config.collator.batch_frames_in, - batch_frames_out=config.collator.batch_frames_out, - batch_frames_inout=config.collator.batch_frames_inout, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -284,18 +256,20 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) logger.info("Setup train/valid Dataloader!") else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -305,16 +279,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, 
train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -324,7 +298,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -332,7 +306,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): if self.train: @@ -355,7 +329,7 @@ class U2Trainer(Trainer): if not self.train: return - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -375,7 +349,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -400,41 +374,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. 
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -453,10 +398,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_config = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -464,12 +409,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( @@ -488,15 +433,15 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info("One example error rate [%s] = %f" % ( + decode_config.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_config.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -507,7 +452,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -558,15 +503,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -577,10 +522,10 @@ class U2Tester(U2Trainer): List[paddle.static.InputSpec]: input spec. 
""" from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + infer_model = U2InferModel.from_pretrained(self.train_loader, + self.config.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.feat_dim + feat_dim = self.train_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 22a0a3c5..ab87c30d 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -44,77 +44,75 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') + config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) @@ -122,7 +120,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -136,7 +134,7 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -156,7 +154,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -182,7 +180,7 @@ class U2Trainer(Trainer): def setup_updater(self): output_dir = self.output_dir - config = self.config.training + config = self.config updater = U2Updater( model=self.model, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py index 67bed349..422483b9 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -69,6 +69,10 @@ if __name__ == "__main__": config = CfgNode() config.set_new_allowed(True) config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 9b8274ad..780c5c08 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -42,45 +42,12 @@ from paddlespeech.s2t.utils.utility import UpdateConfig logger = Log(__name__).getlog() - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - _C = CfgNode() - - _C.model = U2Model.params() - - _C.training = U2Trainer.params() - - _C.decoding = U2Tester.params() - - config = _C.clone() - 
config.set_new_allowed(True) - return config - - class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - checkpoint=dict( - kbest_n=50, - latest_n=5, ), )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -122,7 +89,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -157,7 +124,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -186,7 +153,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -235,10 +202,10 @@ class U2Trainer(Trainer): config = self.config.clone() # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -248,16 +215,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -268,16 +235,18 @@ class U2Trainer(Trainer): batch_frames_out=0, batch_frames_inout=0, preprocess_conf=None, - n_iter_processes=config.collator.num_workers, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -293,10 +262,10 @@ class U2Trainer(Trainer): num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - 
batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -316,7 +285,7 @@ class U2Trainer(Trainer): config = self.config # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size @@ -360,41 +329,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -413,10 +353,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -424,12 +364,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - 
start_time for i, (utt, target, result, rec_tids) in enumerate( @@ -449,15 +389,16 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_cfg.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -468,7 +409,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -519,15 +460,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -539,7 +480,7 @@ class U2Tester(U2Trainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py index 69d9718f..1bc4e1f3 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/export.py +++ b/paddlespeech/s2t/exps/u2_st/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Export for U2 model.""" -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +31,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 93c2fee0..1d70a310 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 58496c88..4dec9ec8 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py deleted file mode 100644 index a48f9106..00000000 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2_st.model import U2STTester -from paddlespeech.s2t.exps.u2_st.model import U2STTrainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2_st import U2STModel - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2STModel.params() - -_C.training = U2STTrainer.params() - -_C.decoding = U2STTester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 89408786..ca2c2c1d 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -45,38 +45,11 @@ logger = Log(__name__).getlog() class U2STTrainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward utt, audio, audio_len, text, text_len = batch_data @@ -127,13 +100,13 @@ class U2STTrainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -174,7 +147,7 @@ class U2STTrainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_st_loss'] = total_loss / num_seen_utts @@ -203,7 +176,7 @@ class U2STTrainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < 
self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -236,7 +209,7 @@ class U2STTrainer(Trainer): msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1 - ) % self.config.training.log_interval == 0: + ) % self.config.log_interval == 0: logger.info(msg) except Exception as e: logger.error(e) @@ -269,17 +242,17 @@ class U2STTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() - load_transcript = True if config.model.model_conf.asr_weight > 0 else False + load_transcript = True if config.model_conf.asr_weight > 0 else False if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, - maxlen_in=config.collator.maxlen_in, - maxlen_out=config.collator.maxlen_out, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, minibatches=0, mini_batch_size=1, batch_count='auto', @@ -287,19 +260,18 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, subsampling_factor=1, load_aux_output=load_transcript, num_encs=1, dist_sampler=True) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -309,9 +281,8 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, subsampling_factor=1, load_aux_output=load_transcript, num_encs=1, @@ -319,11 +290,12 @@ class U2STTrainer(Trainer): logger.info("Setup train/valid Dataloader!") else: # test dataset, return raw text + decode_batch_size = config.get('decode',dict()).get('decode_batch_size', 1) self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -333,9 +305,8 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. 
- augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, dist_sampler=False) @@ -344,7 +315,7 @@ class U2STTrainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): if self.train: model_conf.input_dim = self.train_loader.feat_dim @@ -361,7 +332,7 @@ class U2STTrainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -381,7 +352,7 @@ class U2STTrainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -407,41 +378,12 @@ class U2STTrainer(Trainer): class U2STTester(U2STTrainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. 
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -455,19 +397,19 @@ class U2STTester(U2STTrainer): def translate(self, audio, audio_len): """"E2E translation from extracted audio feature""" - cfg = self.config.decoding + decode_cfg = self.config.decode self.model.eval() hyps = self.model.decode( audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) return hyps def compute_translation_metrics(self, @@ -478,7 +420,7 @@ class U2STTester(U2STTrainer): texts_len, bleu_func, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode len_refs, num_ins = 0, 0 start_time = time.time() @@ -489,12 +431,12 @@ class U2STTester(U2STTrainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time @@ -525,10 +467,10 @@ class U2STTester(U2STTrainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - cfg = self.config.decoding - bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + decode_cfg = self.config.decode + bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms hyps, refs = [], [] len_refs, num_ins = 0, 0 num_frames = 0.0 @@ -573,7 +515,7 @@ class U2STTester(U2STTrainer): "num_examples": num_ins, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @@ -586,7 +528,7 @@ class U2STTester(U2STTrainer): """ from paddlespeech.s2t.models.u2_st import U2STInferModel infer_model = U2STInferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 5f233549..3a14b2d5 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py 
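The hunks above drop the nested config.data / config.collator / config.model / config.training / config.decoding groups in favour of top-level keys, with decode options split out into a file passed via --decode_cfg and read through config.decode. A minimal sketch of that flat-config pattern, assuming yacs CfgNode and placeholder YAML paths (the actual loading code lives elsewhere in the repo):

    from yacs.config import CfgNode

    # Flat training/model config; new_allowed lets merge_from_file add keys
    # that were not declared up front.
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/transformer.yaml")   # placeholder path

    # Decode options live in their own file (e.g. conf/tuning/decode.yaml)
    # and are attached under a single "decode" sub-node.
    decode_cfg = CfgNode(new_allowed=True)
    decode_cfg.merge_from_file("conf/tuning/decode.yaml")
    config.decode = decode_cfg

    # Flat access replaces the old config.collator.* / config.training.* lookups.
    batch_size = config.batch_size
    n_epoch = config.n_epoch
    # Safe lookup with a default, mirroring config.get('decode', dict()) above.
    decode_batch_size = config.get("decode", dict()).get("decode_batch_size", 1)
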
@@ -219,33 +219,6 @@ class SpeechCollatorBase(): class SpeechCollator(SpeechCollatorBase): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - spectrum_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a SpeechCollator object from a config. @@ -256,45 +229,43 @@ class SpeechCollator(SpeechCollatorBase): Returns: SpeechCollator: collator object. """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'spectrum_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') + config.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.collator.augmentation_config + aug_file = config.augmentation_config assert isinstance(aug_file, io.StringIO) speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - spectrum_type=config.collator.spectrum_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) return speech_collator diff --git a/paddlespeech/s2t/io/dataset.py 
b/paddlespeech/s2t/io/dataset.py index d64d7d3e..9149fb27 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -28,22 +28,6 @@ logger = Log(__name__).getlog() class ManifestDataset(Dataset): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - manifest="", - max_input_len=27.0, - min_input_len=0.0, - max_output_len=float('inf'), - min_output_len=0.0, - max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a ManifestDataset object from a config. @@ -54,17 +38,17 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. """ - assert 'manifest' in config.data - assert config.data.manifest + assert 'manifest' in config + assert config.manifest dataset = cls( - manifest_path=config.data.manifest, - max_input_len=config.data.max_input_len, - min_input_len=config.data.min_input_len, - max_output_len=config.data.max_output_len, - min_output_len=config.data.min_output_len, - max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, ) + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) return dataset def __init__(self, diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0dfaec29..ddc3612d 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -119,21 +119,6 @@ class DeepSpeech2Model(nn.Layer): before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
- ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -221,12 +206,12 @@ class DeepSpeech2Model(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -240,7 +225,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index 85876bce..aae77f74 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -243,23 +243,6 @@ class DeepSpeech2ModelOnline(nn.Layer): before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=4, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=True, #Use gru if set True. Use simple rnn if set False. 
- blank_id=0, # index of blank in vocob.txt - ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__( self, feat_size, @@ -353,14 +336,14 @@ class DeepSpeech2ModelOnline(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -374,7 +357,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2ModelOnline diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 83eff467..26e81acf 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -59,57 +59,6 @@ logger = Log(__name__).getlog() class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - ctc_weight=0.3, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 8b07e389..1c5596ba 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""U2 ASR Model -Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ import time @@ -51,58 +51,6 @@ logger = Log(__name__).getlog() class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - asr_weight=0.0, - ctc_weight=0.0, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -289,8 +237,8 @@ class U2STBaseModel(nn.Layer): simulate_streaming (bool, optional): streaming or not. Defaults to False. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: - encoder hiddens (B, Tmax, D), + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), encoder hiddens mask (B, 1, Tmax). """ # Let's assume B = batch_size @@ -533,21 +481,21 @@ class U2STBaseModel(nn.Layer): feats (Tenosr): audio features, (B, T, D) feats_lengths (Tenosr): (B) text_feature (TextFeaturizer): text feature object. - decoding_method (str): decoding mode, e.g. - 'fullsentence', + decoding_method (str): decoding mode, e.g. + 'fullsentence', 'simultaneous' beam_size (int): beam size for search decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here. - num_decoding_left_chunks (int, optional): + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): number of left chunks for decoding. Defaults to -1. simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. Raises: ValueError: when not support decoding_method. - + Returns: List[List[int]]: transcripts. """ @@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel): ValueError: raise when using not support encoder type. 
Returns: - int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ if configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 3ef871c5..bb85732a 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -97,6 +97,14 @@ def default_argument_parser(parser=None): train_group.add_argument( "--dump-config", metavar="FILE", help="dump config to `this` file.") + test_group = parser.add_argument_group( + title='Test Options', description=None) + + test_group.add_argument( + "--decode_cfg", + metavar="DECODE_CONFIG_FILE", + help="decode config file.") + profile_group = parser.add_argument_group( title='Benchmark Options', description=None) profile_group.add_argument( diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index 9bf1ca4d..4b2011ec 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -117,8 +117,8 @@ class Trainer(): self.init_parallel() self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) # set random seed if needed if args.seed: @@ -129,8 +129,8 @@ class Trainer(): if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size: with UpdateConfig(self.config): - self.config.collator.batch_size = self.args.benchmark_batch_size - self.config.training.log_interval = 1 + self.config.batch_size = self.args.benchmark_batch_size + self.config.log_interval = 1 logger.info( f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") @@ -260,7 +260,7 @@ class Trainer(): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index 73c79816..dc1be815 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -130,7 +130,7 @@ def get_subsample(config): Returns: int: subsample rate. 
""" - input_layer = config["model"]["encoder_conf"]["input_layer"] + input_layer = config["encoder_conf"]["input_layer"] assert input_layer in ["conv2d", "conv2d6", "conv2d8"] if input_layer == "conv2d": return 4 diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index fcd0c235..c9d640ed 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) bs_item=(16) config_path=conf/benchmark/conformer.yaml +decode_config_path=conf/tuning/decode.yaml seed=0 output=exp/conformer profiler_options=None @@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" run_mode=mp ngpu=8 - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 sleep 60 log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp ngpu=1 - CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) sleep 60 done done diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 5b83b15c..16cd410e 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -5,13 +5,14 @@ function _set_params(){ run_mode=${1:-"sp"} # 单卡sp|多卡mp config_path=${2:-"conf/conformer.yaml"} - output=${3:-"exp/conformer"} - seed=${4:-"0"} - ngpu=${5:-"1"} - profiler_options=${6:-"None"} - batch_size=${7:-"32"} - fp_item=${8:-"fp32"} - model_item=${9:-"conformer"} + decode_config_path=${3:-"conf/tuning/decode.yaml"} + output=${4:-"exp/conformer"} + seed=${5:-"0"} + ngpu=${6:-"1"} + profiler_options=${7:-"None"} + batch_size=${8:-"32"} + fp_item=${9:-"fp32"} + model_item=${10:-"conformer"} benchmark_max_step=0 run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 # 添加日志解析需要的参数 @@ -35,6 +36,7 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" train_cmd="--config=${config_path} \ + --decode_cfg=${decode_config_path} \ --output=${output} \ --seed=${seed} \ --ngpu=${ngpu} \ @@ -69,6 +71,6 @@ function _train(){ source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -# _train # 如果只想产出训练log,不解析,可取消注释 +#_train # 如果只想产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 diff --git 
a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt index b11872bd..cad8efa3 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index 875e3ccf..5c619506 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh index 76b22a38..1dce1b29 100644 --- a/tests/chains/ds2/lite_train_infer.sh +++ b/tests/chains/ds2/lite_train_infer.sh @@ -1,5 +1,5 @@ bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer -cd ../../examples/tiny/s0 +cd ../../../examples/tiny/asr0 source path.sh -bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer +bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer cd ../../../tests/chains diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh index 73a30283..4913ce42 100644 --- a/tests/chains/ds2/prepare.sh +++ b/tests/chains/ds2/prepare.sh @@ -34,7 +34,7 @@ MODE=$2 if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/tiny/s0 + cd ${curPath}/../../../examples/tiny/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then elif [ ${MODE} = "whole_train_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd 
${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then cd ${curPath} elif [ ${MODE} = "whole_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then cd ${curPath} else curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index c9307820..26917c67 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -324,6 +324,7 @@ else gsu=${gpu//,/ } nump=`echo $gsu | wc -w` cmd="${python} ${run_train} --ngpu=$nump" + export CUDA_VISIBLE_DEVICES=${gpu} else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi
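After this refactor, model construction reads every hyper-parameter from the top level of the config rather than from a model sub-node. A hypothetical helper (not part of the patch) that collects the flat keys DeepSpeech2Model.from_pretrained now expects, assuming config is a flat yacs CfgNode like the one sketched earlier:

    def ds2_model_kwargs(config):
        """Gather DeepSpeech2 constructor arguments from a flat CfgNode."""
        return dict(
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            use_gru=config.use_gru,
            share_rnn_weights=config.share_rnn_weights,
            blank_id=config.blank_id,
            # optional key; falls back to None when the YAML omits it
            ctc_grad_norm_type=config.get("ctc_grad_norm_type", None),
        )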