Merge pull request #1225 from Jackwaterveg/new_config

[ASR][Config]refactor the train and test config
4 years ago · 4cab9f625b
parent 03f8accd07 455bf477a4
commit 4cab9f625b
140 changed files with 3266 additions and 3792 deletions
--- a/examples/aishell/asr0/conf/deepspeech2.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2.yaml
@ -1,68 +1,64 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev
-  max_input_len: 27.0 # second
+test_manifest: data/manifest.test
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: 27.0 # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 64 # one gpu
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.json
+###########################################
-  unit_type: char
+batch_size: 64 # one gpu
-  vocab_filepath: data/lang_char/vocab.txt 
+mean_std_filepath: data/mean_std.json
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/lang_char/vocab.txt 
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True
 shuffle_method: batch_shuffle
 num_workers: 2
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 1024
+num_conv_layers: 2
-  use_gru: True 
+num_rnn_layers: 3
-  share_rnn_weights: False
+rnn_layer_size: 1024
-  blank_id: 0
+use_gru: True 
-  ctc_grad_norm_type: instance 
+share_rnn_weights: False
 blank_id: 0
 ctc_grad_norm_type: instance 
-training:
+###########################################
-  n_epoch: 80
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 2e-3
+n_epoch: 80
-  lr_decay: 0.83
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 2e-3
-  global_grad_clip: 3.0
+lr_decay: 0.83
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 3.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
  alpha: 1.9
  beta: 5.0
  beam_size: 300
  cutoff_prob: 0.99
  cutoff_top_n: 40
  num_proc_bsearch: 10
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
@ -1,70 +1,68 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev
-  max_input_len: 27.0 # second
+test_manifest: data/manifest.test
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: 27.0 # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 64 # one gpu
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.json
+###########################################
-  unit_type: char
+batch_size: 64 # one gpu
-  vocab_filepath: data/lang_char/vocab.txt 
+mean_std_filepath: data/mean_std.json
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/lang_char/vocab.txt 
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear #linear, mfcc, fbank
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear #linear, mfcc, fbank
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 0
+sortagrad: True
 shuffle_method: batch_shuffle
 num_workers: 0
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 5
+############################################
-  rnn_layer_size: 1024
+num_conv_layers: 2
-  rnn_direction: forward # [forward, bidirect]
+num_rnn_layers: 5
-  num_fc_layers: 0
+rnn_layer_size: 1024
-  fc_layers_size_list: -1,
+rnn_direction: forward # [forward, bidirect]
-  use_gru: False 
+num_fc_layers: 0
-  blank_id: 0
+fc_layers_size_list: -1,
 use_gru: False 
 blank_id: 0
-training:
+###########################################
-  n_epoch: 65
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 5e-4
+n_epoch: 65
-  lr_decay: 0.93
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 5e-4
-  global_grad_clip: 3.0
+lr_decay: 0.93
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 3.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
-decoding:
+  
  batch_size: 32
  error_rate_type: cer 
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
  alpha: 2.2 #1.9
  beta: 4.3
  beam_size: 300
  cutoff_prob: 0.99
  cutoff_top_n: 40
  num_proc_bsearch: 10
--- a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
+++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
@ -0,0 +1,10 @@
 chunk_batch_size: 32
 error_rate_type: cer 
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
 alpha: 2.2 #1.9
 beta: 4.3
 beam_size: 300
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 10
--- a/examples/aishell/asr0/conf/tuning/decode.yaml
+++ b/examples/aishell/asr0/conf/tuning/decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
 alpha: 1.9
 beta: 5.0
 beam_size: 300
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 10
--- a/examples/aishell/asr0/local/test.sh
+++ b/examples/aishell/asr0/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
 model_type=$4
 # download language model
 bash local/download_lm_ch.sh
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
--- a/examples/aishell/asr0/local/test_export.sh
+++ b/examples/aishell/asr0/local/test_export.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-jit_model_export_path=$2
+decode_config_path=$2
-model_type=$3
+jit_model_export_path=$3
 model_type=$4
 # download language model
 bash local/download_lm_ch.sh > /dev/null 2>&1
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test_export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${jit_model_export_path}.rsl \
 --export_path ${jit_model_export_path} \
 --model_type ${model_type}
--- a/examples/aishell/asr0/local/test_wav.sh
+++ b/examples/aishell/asr0/local/test_wav.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
+if [ $# != 5 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
    exit -1
 fi
@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
-audio_file=$4
+model_type=$4
 audio_file=$5
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@ -33,6 +34,7 @@ fi
 python3 -u ${BIN_DIR}/test_wav.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type} \
--- a/examples/aishell/asr0/run.sh
+++ b/examples/aishell/asr0/run.sh
@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml    #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline    # offline or online
 audio_file=data/demo_01_03.wav
@ -34,7 +35,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@ -44,11 +45,11 @@ fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
 fi
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -23,10 +24,9 @@ model:
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
-
+# decoder related
-    # decoder related
+decoder: transformer
-    decoder: transformer
+decoder_conf:
    decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -34,70 +34,62 @@ model:
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-
+# hybrid CTC/attention
-    # hybrid CTC/attention
+model_conf:
    model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
 ###########################################
 #                   Data                  #
 ###########################################
-data:
+train_manifest: data/manifest.train
-  train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
-  dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
  test_manifest: data/manifest.test
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'char'
+###########################################
  augmentation_config: conf/preprocess.yaml
  feat_dim: 80
  stride_ms: 10.0
  window_ms: 25.0
  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
  batch_size: 64
  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
  minibatches: 0 # for debug
  batch_count: auto
  batch_bins: 0 
  batch_frames_in: 0
  batch_frames_out: 0
  batch_frames_inout: 0
  num_workers: 0
  subsampling_factor: 1
  num_encs: 1
 vocab_filepath: data/lang_char/vocab.txt 
 spm_model_prefix: ''
 unit_type: 'char'
 preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 64
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-training:
+###########################################
-  n_epoch: 240 
+#                 Training                #
-  accum_grad: 2
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 240 
-  optim: adam
+accum_grad: 2
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.002
-    weight_decay: 1e-6
+  weight_decay: 1.0e-6
-  scheduler: warmuplr
+scheduler: warmuplr
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  beam_size: 10
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -20,9 +21,9 @@ model:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -31,67 +32,58 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test
 ###########################################
 #              Dataloader                 #
 ###########################################
 vocab_filepath: data/lang_char/vocab.txt 
 spm_model_prefix: ''
 unit_type: 'char'
 preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 64
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 num_workers: 2
 subsampling_factor: 1
 num_encs: 1
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#                Training                 #
-  unit_type: 'char'
+###########################################
-  augmentation_config: conf/preprocess.yaml
+n_epoch: 240 
-  feat_dim: 80
+accum_grad: 2
-  stride_ms: 10.0
+global_grad_clip: 5.0
-  window_ms: 25.0
+optim: adam
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+optim_conf:
  batch_size: 64
  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
  minibatches: 0 # for debug
  batch_count: auto
  batch_bins: 0 
  batch_frames_in: 0
  batch_frames_out: 0
  batch_frames_inout: 0
  num_workers: 8
  subsampling_factor: 1
  num_encs: 1
 training:
  n_epoch: 240 
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
  lr: 0.002
-    weight_decay: 1e-6
+  weight_decay: 1.0e-6
-  scheduler: warmuplr
+scheduler: warmuplr
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  beam_size: 10
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -14,10 +15,9 @@ model:
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-
+# decoder related
-    # decoder related
+decoder: transformer
-    decoder: transformer
+decoder_conf:
    decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -26,70 +26,60 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-
+###########################################
 #                   Data                  #
 ###########################################
 # https://yaml.org/type/float.html
-data:
+train_manifest: data/manifest.train
-  train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
-  dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
  test_manifest: data/manifest.test
 collator:
  unit_type: 'char'
  vocab_filepath: data/lang_char/vocab.txt 
  feat_dim: 80
  stride_ms: 10.0
  window_ms: 25.0
  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
  batch_size: 64 
  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
  minibatches: 0 # for debug
  batch_count: auto
  batch_bins: 0 
  batch_frames_in: 0
  batch_frames_out: 0
  batch_frames_inout: 0
  augmentation_config: conf/preprocess.yaml 
  num_workers: 0
  subsampling_factor: 1
  num_encs: 1
 ###########################################
 #              Dataloader                 #
 ###########################################
 unit_type: 'char'
 vocab_filepath: data/lang_char/vocab.txt 
 spm_model_prefix: ''
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 64 
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 preprocess_config: conf/preprocess.yaml 
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-
+###########################################
-training:
+#                 Training                #
-  n_epoch: 240 
+###########################################
-  accum_grad: 2
+n_epoch: 240 
-  global_grad_clip: 5.0
+accum_grad: 2
-  optim: adam
+global_grad_clip: 5.0
-  optim_conf:
+optim: adam
 optim_conf:
  lr: 0.002
-    weight_decay: 1e-6
+  weight_decay: 1.0e-6
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  beam_size: 10
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
@ -0,0 +1,11 @@
 beam_size: 10
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/conf/tuning/decode.yaml
+++ b/examples/aishell/asr1/conf/tuning/decode.yaml
@ -0,0 +1,11 @@
 beam_size: 10
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/local/align.sh
+++ b/examples/aishell/asr1/local/align.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 batch_size=1
 output_dir=${ckpt_prefix}
@ -20,9 +21,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/aishell/asr1/local/test_wav.sh
+++ b/examples/aishell/asr1/local/test_wav.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-audio_file=$3
+ckpt_prefix=$3
 audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@ -42,10 +43,11 @@ for type in  attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size} \
+    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}
    if [ $? -ne 0 ]; then
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
 audio_file=data/demo_01_03.wav
@ -32,18 +33,18 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
 # Not supported for now!!!
--- a/examples/callcenter/asr1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@ -1,48 +1,47 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.5
+dev_manifest: data/manifest.dev
-  max_input_len: 20.0 # second
+test_manifest: data/manifest.test
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'char'
+###########################################
-  spm_model_prefix: ''
+vocab_filepath: data/lang_char/vocab.txt 
-  augmentation_config: conf/preprocess.yaml
+unit_type: 'char'
-  batch_size: 32
+spm_model_prefix: ''
-  raw_wav: True  # use raw_wav or kaldi feature
+preprocess_config: conf/preprocess.yaml
-  spectrum_type: fbank #linear, mfcc, fbank
+batch_size: 32
-  feat_dim: 80
+raw_wav: True  # use raw_wav or kaldi feature
-  delta_delta: False
+spectrum_type: fbank #linear, mfcc, fbank
-  dither: 1.0
+feat_dim: 80
-  target_sample_rate: 8000
+delta_delta: False
-  max_freq: None
+dither: 1.0
-  n_fft: None
+target_sample_rate: 8000
-  stride_ms: 10.0
+max_freq: None
-  window_ms: 25.0
+n_fft: None
-  use_dB_normalization: True 
+stride_ms: 10.0
-  target_dB: -20
+window_ms: 25.0
-  random_seed: 0
+use_dB_normalization: True 
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True 
+random_seed: 0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True 
 shuffle_method: batch_shuffle
 num_workers: 2
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -62,9 +61,9 @@ model:
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -73,48 +72,27 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-
+###########################################
-training:
+#                Training                 #
-  n_epoch: 240
+###########################################
-  accum_grad: 4
+n_epoch: 240
-  global_grad_clip: 5.0
+accum_grad: 4
-  optim: adam
+global_grad_clip: 5.0
-  optim_conf:
+optim: adam
 optim_conf:
  lr: 0.001
-    weight_decay: 1e-6
+  weight_decay: 1.0e-6
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@ -1,47 +1,44 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.5
+dev_manifest: data/manifest.dev
-  max_input_len: 20.0 # second
+test_manifest: data/manifest.test
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.0
  max_output_input_ratio: .inf 
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'char'
+###########################################
-  spm_model_prefix: ''
+vocab_filepath: data/lang_char/vocab.txt 
-  augmentation_config: conf/preprocess.yaml
+unit_type: 'char'
-  batch_size: 32
+spm_model_prefix: ''
-  raw_wav: True  # use raw_wav or kaldi feature
+preprocess_config: conf/preprocess.yaml
-  spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
-  feat_dim: 80
+stride_ms: 10.0
-  delta_delta: False
+window_ms: 25.0
-  dither: 1.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  target_sample_rate: 8000
+batch_size: 64
-  max_freq: None
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  n_fft: None
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  stride_ms: 10.0
+minibatches: 0 # for debug
-  window_ms: 25.0
+batch_count: auto
-  use_dB_normalization: True 
+batch_bins: 0 
-  target_dB: -20
+batch_frames_in: 0
-  random_seed: 0
+batch_frames_out: 0
-  keep_transcription_text: False
+batch_frames_inout: 0
-  sortagrad: True 
+num_workers: 0
-  shuffle_method: batch_shuffle
+subsampling_factor: 1
-  num_workers: 2
+num_encs: 1
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -57,9 +54,9 @@ model:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -68,50 +65,28 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 100 # 50 will be lowest 
+#                Training                 #
-  accum_grad: 4
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 100 # 50 will be lowest 
-  optim: adam
+accum_grad: 4
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.002
-    weight_decay: 1e-6
+  weight_decay: 1.0e-6
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/callcenter/asr1/conf/preprocess.yaml
+++ b/examples/callcenter/asr1/conf/preprocess.yaml
@ -1,7 +1,7 @@
 process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
-    fs: 16000
+    fs: 8000
    n_mels: 80
    n_shift: 160
    win_length: 400
--- a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/callcenter/asr1/conf/tuning/decode.yaml
+++ b/examples/callcenter/asr1/conf/tuning/decode.yaml
@ -0,0 +1,13 @@
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/callcenter/asr1/local/align.sh
+++ b/examples/callcenter/asr1/local/align.sh
@ -1,7 +1,7 @@
 #! /usr/bin/env bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 ckpt_name=$(basename ${ckpt_prefxi})
@ -25,9 +26,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
--- a/examples/callcenter/asr1/local/test.sh
+++ b/examples/callcenter/asr1/local/test.sh
@ -1,7 +1,7 @@
 #! /usr/bin/env bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 ckpt_name=$(basename ${ckpt_prefxi})
@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
@ -4,8 +4,9 @@ source path.sh
 gpus=0,1,2,3
 stage=0
-stop_stage=100
+stop_stage=50
 conf_path=conf/conformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -31,15 +32,15 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
@ -1,68 +1,65 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev-clean
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev-clean
-  max_input_len: 30.0 # second
+test_manifest: data/manifest.test-clean
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: 30.0 # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 20
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.json
+###########################################
-  unit_type: char
+batch_size: 20
-  vocab_filepath: data/lang_char/vocab.txt
+mean_std_filepath: data/mean_std.json
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/lang_char/vocab.txt
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  target_sample_rate: 16000
+spm_model_prefix: 
-  max_freq: None
+spectrum_type: linear
-  n_fft: None
+feat_dim: 
-  stride_ms: 10.0
+target_sample_rate: 16000
-  window_ms: 20.0
+max_freq: None
-  delta_delta: False
+n_fft: None
-  dither: 1.0
+stride_ms: 10.0
-  use_dB_normalization: True 
+window_ms: 20.0
-  target_dB: -20
+delta_delta: False
-  random_seed: 0
+dither: 1.0
-  keep_transcription_text: False
+use_dB_normalization: True 
-  sortagrad: True 
+target_dB: -20
-  shuffle_method: batch_shuffle
+random_seed: 0
-  num_workers: 2
+keep_transcription_text: False
 sortagrad: True 
 shuffle_method: batch_shuffle
 num_workers: 2
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 2048
+num_conv_layers: 2
-  use_gru: False 
+num_rnn_layers: 3
-  share_rnn_weights: True
+rnn_layer_size: 2048
-  blank_id: 0
+use_gru: False 
 share_rnn_weights: True
 blank_id: 0
-training:
+###########################################
-  n_epoch: 50
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 1e-3
+n_epoch: 50
-  lr_decay: 0.83
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 1e-3
-  global_grad_clip: 5.0
+lr_decay: 0.83
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 5.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: wer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 1.9
  beta: 0.3
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
@ -1,70 +1,67 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev-clean
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev-clean
-  max_input_len: 30.0 # second
+test_manifest: data/manifest.test-clean
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: 30.0 # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 15
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.json
+###########################################
-  unit_type: char
+batch_size: 15
-  vocab_filepath: data/lang_char/vocab.txt
+mean_std_filepath: data/mean_std.json
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/lang_char/vocab.txt
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  target_sample_rate: 16000
+spm_model_prefix: 
-  max_freq: None
+spectrum_type: linear
-  n_fft: None
+feat_dim: 
-  stride_ms: 10.0
+target_sample_rate: 16000
-  window_ms: 20.0
+max_freq: None
-  delta_delta: False
+n_fft: None
-  dither: 1.0
+stride_ms: 10.0
-  use_dB_normalization: True 
+window_ms: 20.0
-  target_dB: -20
+delta_delta: False
-  random_seed: 0
+dither: 1.0
-  keep_transcription_text: False
+use_dB_normalization: True 
-  sortagrad: True 
+target_dB: -20
-  shuffle_method: batch_shuffle
+random_seed: 0
-  num_workers: 0
+keep_transcription_text: False
 sortagrad: True 
 shuffle_method: batch_shuffle
 num_workers: 0
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 2048
+num_conv_layers: 2
-  rnn_direction: forward
+num_rnn_layers: 3
-  num_fc_layers: 2
+rnn_layer_size: 2048
-  fc_layers_size_list: 512, 256
+rnn_direction: forward
-  use_gru: False 
+num_fc_layers: 2
-  blank_id: 0
+fc_layers_size_list: 512, 256
 use_gru: False 
 blank_id: 0
-training:
+###########################################
-  n_epoch: 50
+#                Training                 #
-  accum_grad: 4
+###########################################
-  lr: 1e-3
+n_epoch: 50
-  lr_decay: 0.83
+accum_grad: 4
-  weight_decay: 1e-06
+lr: 1e-3
-  global_grad_clip: 5.0
+lr_decay: 0.83
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 5.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: wer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 1.9
  beta: 0.3
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
+++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 128
 error_rate_type: wer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
 alpha: 1.9
 beta: 0.3
 beam_size: 500
 cutoff_prob: 1.0
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/librispeech/asr0/conf/tuning/decode.yaml
+++ b/examples/librispeech/asr0/conf/tuning/decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 128
 error_rate_type: wer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
 alpha: 1.9
 beta: 0.3
 beam_size: 500
 cutoff_prob: 1.0
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/librispeech/asr0/local/test.sh
+++ b/examples/librispeech/asr0/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
 model_type=$4
 # download language model
 bash local/download_lm_en.sh
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
--- a/examples/librispeech/asr0/local/test_wav.sh
+++ b/examples/librispeech/asr0/local/test_wav.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
+if [ $# != 5 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
    exit -1
 fi
@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
-audio_file=$4
+model_type=$4
 audio_file=$5
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@ -33,6 +34,7 @@ fi
 python3 -u ${BIN_DIR}/test_wav.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type} \
--- a/examples/librispeech/asr0/run.sh
+++ b/examples/librispeech/asr0/run.sh
@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 model_type=offline
 audio_file=data/demo_002_en.wav
@ -33,7 +34,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@ -43,5 +44,5 @@ fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -24,9 +25,9 @@ model:
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -35,69 +36,64 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+vocab_filepath: data/lang_char/vocab.txt 
-  mean_std_filepath: ""
+unit_type: 'spm'
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  feat_dim: 80
+mean_std_filepath: ""
-  stride_ms: 10.0
+preprocess_config: conf/preprocess.yaml
-  window_ms: 25.0
+feat_dim: 80
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  batch_size: 16
+window_ms: 25.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 16
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config: conf/preprocess.yaml 
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0 
-  subsampling_factor: 1
+num_workers: 0
-  num_encs: 1
+subsampling_factor: 1
 num_encs: 1
-training:
+###########################################
-  n_epoch: 120
+#                 Training                #
-  accum_grad: 8
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 120
-  optim: adam
+accum_grad: 8
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.001
-    weight_decay: 1e-06 
+  weight_decay: 1.0e-06 
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
-  log_interval: 100
+  lr_decay: 1.0
-  checkpoint:
+log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
-decoding:
+
  batch_size: 128
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  beam_size: 10
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -17,9 +18,9 @@ model:
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -28,76 +29,61 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+vocab_filepath: data/lang_char/vocab.txt 
-  mean_std_filepath: ""
+unit_type: 'spm'
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  feat_dim: 80
+mean_std_filepath: ""
-  stride_ms: 10.0
+preprocess_config: conf/preprocess.yaml
-  window_ms: 25.0
+feat_dim: 80
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  batch_size: 64
+window_ms: 25.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 64
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config: conf/preprocess.yaml 
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0
-  subsampling_factor: 1
+num_workers: 0
-  num_encs: 1
+subsampling_factor: 1
 num_encs: 1
-training:
+###########################################
-  n_epoch: 120
+#                 Training                #
-  accum_grad: 1
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 120
-  optim: adam
+accum_grad: 1
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.001
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -20,9 +21,9 @@ model:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -31,74 +32,65 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    ctc_grad_norm_type: null 
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test-clean
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+vocab_filepath: data/lang_char/vocab.txt 
-  mean_std_filepath: ""
+unit_type: 'spm'
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  feat_dim: 80
+mean_std_filepath: ""
-  stride_ms: 10.0
+preprocess_config: conf/preprocess.yaml
-  window_ms: 25.0
+feat_dim: 80
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  batch_size: 16
+window_ms: 25.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 16
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config: conf/preprocess.yaml 
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0
-  subsampling_factor: 1
+num_workers: 0
-  num_encs: 1
+subsampling_factor: 1
 num_encs: 1
-training:
+###########################################
-  n_epoch: 70
+#                 Training                #
-  accum_grad: 8
+###########################################
-  global_grad_clip: 3.0
+n_epoch: 70
-  optim: adam
+accum_grad: 8
-  optim_conf:
+global_grad_clip: 3.0
 optim: adam
 optim_conf:
  lr: 0.004
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  beam_size: 10
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@ -1,10 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -15,9 +16,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -26,85 +27,62 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
-  min_input_len: 0.5  # second
+dev_manifest: data/manifest.dev
-  max_input_len: 30.0 # second
+test_manifest: data/manifest.test-clean
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 100.0
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+vocab_filepath: data/lang_char/vocab.txt
-  mean_std_filepath: ""
+unit_type: 'spm'
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  feat_dim: 80
+mean_std_filepath: ""
-  stride_ms: 10.0
+preprocess_config: conf/preprocess.yaml
-  window_ms: 25.0
+feat_dim: 80
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  batch_size: 32 
+window_ms: 25.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 32 
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config: conf/preprocess.yaml 
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0
-  subsampling_factor: 1
+num_workers: 0
-  num_encs: 1
+subsampling_factor: 1
 num_encs: 1
-training:
+###########################################
-  n_epoch: 120 
+#                 Training                #
-  accum_grad: 4
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 120 
-  optim: adam
+accum_grad: 4
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.004
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 100
+log_interval: 100
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 128
 error_rate_type: wer
 decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/conf/tuning/decode.yaml
+++ b/examples/librispeech/asr1/conf/tuning/decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 64
 error_rate_type: wer
 decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/local/align.sh
+++ b/examples/librispeech/asr1/local/align.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 batch_size=1
 output_dir=${ckpt_prefix}
@ -20,9 +21,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
--- a/examples/librispeech/asr1/local/test.sh
+++ b/examples/librispeech/asr1/local/test.sh
@ -15,8 +15,8 @@ recog_set="test-clean"
 stage=0
 stop_stage=100
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        python3 -u ${BIN_DIR}/test.py \
            --ngpu ${ngpu} \
            --config ${config_path} \
            --decode_cfg ${decode_config_path} \
            --result_file ${ckpt_prefix}.${type}.rsl \
            --checkpoint_path ${ckpt_prefix} \
-            --opts decoding.decoding_method ${type} \
+            --opts decode.decoding_method ${type} \
-            --opts decoding.batch_size ${batch_size}
+            --opts decode.decode_batch_size ${batch_size}
        if [ $? -ne 0 ]; then
            echo "Failed in evaluation!"
@ -76,10 +78,11 @@ for type in ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
+        --opts decode.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
+        --opts decode.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/librispeech/asr1/local/test_wav.sh
+++ b/examples/librispeech/asr1/local/test_wav.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-audio_file=$3
+ckpt_prefix=$3
 audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@ -49,10 +50,11 @@ for type in attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size} \
+    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}
    #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
--- a/examples/librispeech/asr1/run.sh
+++ b/examples/librispeech/asr1/run.sh
@ -8,6 +8,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_002_en.wav
@ -34,17 +35,17 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
--- a/examples/librispeech/asr2/conf/decode/decode_base.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 1
 error_rate_type: wer
 decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@ -1,11 +1,12 @@
 # https://yaml.org/type/float.html
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file:  
+############################################
-    cmvn_file_type: "json"
+cmvn_file:  
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -16,9 +17,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -27,45 +28,51 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test-clean
-collator:
+###########################################
-  vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
+#              Dataloader                 #
-  unit_type: spm
+###########################################
-  spm_model_prefix: data/lang_char/train_960_unigram5000
+vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
-  feat_dim: 83
+unit_type: spm
-  stride_ms: 10.0
+spm_model_prefix: data/lang_char/train_960_unigram5000
-  window_ms: 25.0
+feat_dim: 83
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  batch_size: 30 
+window_ms: 25.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 30 
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config: conf/preprocess.yaml 
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0
-  subsampling_factor: 1
+preprocess_config: conf/preprocess.yaml 
-  num_encs: 1
+num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-training:
+###########################################
-  n_epoch: 120
+#                Training                 #
-  accum_grad: 2
+###########################################
-  log_interval: 100
+n_epoch: 120
-  checkpoint:
+accum_grad: 2
 log_interval: 1
 checkpoint:
  kbest_n: 50
  latest_n: 5
@ -79,23 +86,5 @@ scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
 decoding:
  batch_size: 1
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr2/local/align.sh
+++ b/examples/librispeech/asr2/local/align.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-dict_path=$2
+decode_config_path=$2
-ckpt_prefix=$3
+dict_path=$3
 ckpt_prefix=$4
 batch_size=1
 output_dir=${ckpt_prefix}
@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \
 --dict-path ${dict_path} \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result-file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
--- a/examples/librispeech/asr2/local/test.sh
+++ b/examples/librispeech/asr2/local/test.sh
@ -19,6 +19,7 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe}
 bpemodel=${bpeprefix}.model
 config_path=conf/transformer.yaml
 decode_config_path=conf/decode/decode_base.yaml
 dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
 ckpt_prefix=
@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco
            --ngpu ${ngpu} \
            --dict-path ${dict} \
            --config ${config_path} \
            --decode_cfg ${decode_config_path} \
            --checkpoint_path ${ckpt_prefix} \
            --result-file ${decode_dir}/data.JOB.json \
-            --opts decoding.decoding_method ${dmethd} \
+            --opts decode.decoding_method ${dmethd} \
-            --opts decoding.batch_size ${batch_size} \
+            --opts decode.decode_batch_size ${batch_size} \
-            --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
+            --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict}
--- a/examples/librispeech/asr2/run.sh
+++ b/examples/librispeech/asr2/run.sh
@ -9,7 +9,8 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
-dict_path=lang_char/train_960_unigram5000_units.txt
+decode_conf_path=conf/decode/decode_base.yaml
 dict_path=data/lang_char/train_960_unigram5000_units.txt
 avg_num=10
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -35,7 +36,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # attention rescore decoder
-    ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@ -45,7 +46,7 @@ fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
+++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml
@ -1,67 +1,65 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev
-  max_input_len: 27.0 # second
+test_manifest: data/manifest.test
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: 27.0 # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 64 # one gpu
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.npz
+###########################################
-  unit_type: char
+batch_size: 64 # one gpu
-  vocab_filepath: data/vocab.txt 
+mean_std_filepath: data/mean_std.npz
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/vocab.txt 
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True
 shuffle_method: batch_shuffle
 num_workers: 2
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 1024
+num_conv_layers: 2
-  use_gru: True 
+num_rnn_layers: 3
-  share_rnn_weights: False
+rnn_layer_size: 1024
-  blank_id: 4333
+use_gru: True 
 share_rnn_weights: False
 blank_id: 4333
-training:
+###########################################
-  n_epoch: 80
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 2e-3
+n_epoch: 80
-  lr_decay: 0.83
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 2e-3
-  global_grad_clip: 3.0
+lr_decay: 0.83
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 3.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
-decoding:
+  
  batch_size: 32
  error_rate_type: cer 
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
  alpha: 2.6
  beta: 5.0
  beam_size: 300
  cutoff_prob: 0.99
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
+++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 32
 error_rate_type: cer 
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
 alpha: 2.6
 beta: 5.0
 beam_size: 300
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/other/1xt2x/aishell/local/test.sh
+++ b/examples/other/1xt2x/aishell/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
 model_type=$4
 # download language model
 bash local/download_lm_ch.sh
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
--- a/examples/other/1xt2x/aishell/run.sh
+++ b/examples/other/1xt2x/aishell/run.sh
@ -5,6 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline
 gpus=2
@ -23,6 +24,6 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
 fi
--- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml
@ -1,67 +1,64 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev
-  max_input_len: .inf # second
+test_manifest: data/manifest.test-clean
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: .inf # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 64 # one gpu
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.npz
+###########################################
-  unit_type: char
+batch_size: 64 # one gpu
-  vocab_filepath: data/vocab.txt 
+mean_std_filepath: data/mean_std.npz
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/vocab.txt 
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True
 shuffle_method: batch_shuffle
 num_workers: 2
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 1024
+num_conv_layers: 2
-  use_gru: True
+num_rnn_layers: 3
-  share_rnn_weights: False
+rnn_layer_size: 1024
-  blank_id: 28
+use_gru: True
 share_rnn_weights: False
 blank_id: 28
-training:
+###########################################
-  n_epoch: 80
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 2e-3
+n_epoch: 80
-  lr_decay: 0.83
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 2e-3
-  global_grad_clip: 3.0
+lr_decay: 0.83
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 3.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 32
  error_rate_type: wer 
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 1.4
  beta: 0.35
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
+++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 32
 error_rate_type: wer 
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
 alpha: 1.4
 beta: 0.35
 beam_size: 500
 cutoff_prob: 1.0
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/other/1xt2x/baidu_en8k/local/test.sh
+++ b/examples/other/1xt2x/baidu_en8k/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
 model_type=$4
 # download language model
 bash local/download_lm_en.sh
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
--- a/examples/other/1xt2x/baidu_en8k/run.sh
+++ b/examples/other/1xt2x/baidu_en8k/run.sh
@ -5,6 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline
 gpus=0
@ -23,6 +24,6 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
 fi
--- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
+++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml
@ -1,67 +1,64 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test-clean
+train_manifest: data/manifest.train
-  min_input_len: 0.0
+dev_manifest: data/manifest.dev
-  max_input_len: 1000.0 # second
+test_manifest: data/manifest.test-clean
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: .inf
+max_input_len: 1000.0 # second
-  min_output_input_ratio: 0.00
+min_output_len: 0.0
-  max_output_input_ratio: .inf
+max_output_len: .inf
 min_output_input_ratio: 0.00
 max_output_input_ratio: .inf
-collator:
+###########################################
-  batch_size: 64 # one gpu
+#              Dataloader                 #
-  mean_std_filepath: data/mean_std.npz
+###########################################
-  unit_type: char
+batch_size: 64 # one gpu
-  vocab_filepath: data/vocab.txt 
+mean_std_filepath: data/mean_std.npz
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/vocab.txt 
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True
 shuffle_method: batch_shuffle
 num_workers: 2
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 2048
+num_conv_layers: 2
-  use_gru: False
+num_rnn_layers: 3
-  share_rnn_weights: True
+rnn_layer_size: 2048
-  blank_id: 28
+use_gru: False
 share_rnn_weights: True
 blank_id: 28
-training:
+###########################################
-  n_epoch: 80
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 2e-3
+n_epoch: 80
-  lr_decay: 0.83
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 2e-3
-  global_grad_clip: 3.0
+lr_decay: 0.83
-  log_interval: 100
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 3.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 32
  error_rate_type: wer 
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
+++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 32
 error_rate_type: wer 
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
 alpha: 2.5
 beta: 0.3
 beam_size: 500
 cutoff_prob: 1.0
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/other/1xt2x/librispeech/local/test.sh
+++ b/examples/other/1xt2x/librispeech/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
 model_type=$4
 # download language model
 bash local/download_lm_en.sh
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
--- a/examples/other/1xt2x/librispeech/run.sh
+++ b/examples/other/1xt2x/librispeech/run.sh
@ -5,6 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline
 gpus=1
@ -23,5 +24,5 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
 fi
--- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py
+++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py
@ -13,8 +13,8 @@
 # limitations under the License.
 """Evaluation for DeepSpeech2 model."""
 from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester
 from yacs.config import CfgNode
 from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.utility import print_arguments
@ -41,9 +41,13 @@ if __name__ == "__main__":
    print("model_type:{}".format(args.model_type))
    # https://yaml.org/type/float.html
-    config = get_cfg_defaults(args.model_type)
+    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
+++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
@ -120,20 +120,6 @@ class DeepSpeech2Model(nn.Layer):
    :rtype: tuple of LayerOutput
    """
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                num_conv_layers=2,  #Number of stacking convolution layers.
                num_rnn_layers=3,  #Number of stacking RNN layers.
                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
            ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default
    def __init__(self,
                 feat_size,
                 dict_size,
@ -233,11 +219,11 @@ class DeepSpeech2Model(nn.Layer):
        """
        model = cls(feat_size=dataloader.collate_fn.feature_size,
                    dict_size=len(dataloader.collate_fn.vocab_list),
-                    num_conv_layers=config.model.num_conv_layers,
+                    num_conv_layers=config.num_conv_layers,
-                    num_rnn_layers=config.model.num_rnn_layers,
+                    num_rnn_layers=config.num_rnn_layers,
-                    rnn_size=config.model.rnn_layer_size,
+                    rnn_size=config.rnn_layer_size,
-                    use_gru=config.model.use_gru,
+                    use_gru=config.use_gru,
-                    share_rnn_weights=config.model.share_rnn_weights)
+                    share_rnn_weights=config.share_rnn_weights)
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
@ -250,7 +236,7 @@ class DeepSpeech2Model(nn.Layer):
        Parameters
        config: yacs.config.CfgNode
-            config.model
+            config
        Returns
        -------
        DeepSpeech2Model
--- a/examples/other/1xt2x/src_deepspeech2x/test_model.py
+++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py
@ -44,27 +44,11 @@ logger = Log(__name__).getlog()
 class DeepSpeech2Trainer(Trainer):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        # training config
        default = CfgNode(
            dict(
                lr=5e-4,  # learning rate
                lr_decay=1.0,  # learning rate decay
                weight_decay=1e-6,  # the coeff of weight decay
                global_grad_clip=5.0,  # the global norm clip
                n_epoch=50,  # train epochs
            ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default
    def __init__(self, config, args):
        super().__init__(config, args)
    def train_batch(self, batch_index, batch_data, msg):
-        train_conf = self.config.training
+        train_conf = self.config
        start = time.time()
        # forward
@ -98,7 +82,7 @@ class DeepSpeech2Trainer(Trainer):
        iteration_time = time.time() - start
        msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.collator.batch_size)
+        msg += "batch size: {}, ".format(self.config.batch_size)
        msg += "accum: {}, ".format(train_conf.accum_grad)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
@ -126,7 +110,7 @@ class DeepSpeech2Trainer(Trainer):
                total_loss += float(loss) * num_utts
                valid_losses['val_loss'].append(float(loss))
-            if (i + 1) % self.config.training.log_interval == 0:
+            if (i + 1) % self.config.log_interval == 0:
                valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
                valid_dump['val_history_loss'] = total_loss / num_seen_utts
@ -146,15 +130,15 @@ class DeepSpeech2Trainer(Trainer):
    def setup_model(self):
        config = self.config.clone()
        config.defrost()
-        config.model.feat_size = self.train_loader.collate_fn.feature_size
+        config.feat_size = self.train_loader.collate_fn.feature_size
-        #config.model.dict_size = self.train_loader.collate_fn.vocab_size
+        #config.dict_size = self.train_loader.collate_fn.vocab_size
-        config.model.dict_size = len(self.train_loader.collate_fn.vocab_list)
+        config.dict_size = len(self.train_loader.collate_fn.vocab_list)
        config.freeze()
        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config.model)
+            model = DeepSpeech2Model.from_config(config)
        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config.model)
+            model = DeepSpeech2ModelOnline.from_config(config)
        else:
            raise Exception("wrong model type")
        if self.parallel:
@ -163,17 +147,13 @@ class DeepSpeech2Trainer(Trainer):
        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)
-        grad_clip = ClipGradByGlobalNormWithLog(
+        grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
            config.training.global_grad_clip)
        lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
-            learning_rate=config.training.lr,
+            learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
            gamma=config.training.lr_decay,
            verbose=True)
        optimizer = paddle.optimizer.Adam(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
-            weight_decay=paddle.regularizer.L2Decay(
+            weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
                config.training.weight_decay),
            grad_clip=grad_clip)
        self.model = model
@ -184,59 +164,59 @@ class DeepSpeech2Trainer(Trainer):
    def setup_dataloader(self):
        config = self.config.clone()
        config.defrost()
-        config.collator.keep_transcription_text = False
+        config.keep_transcription_text = False
-        config.data.manifest = config.data.train_manifest
+        config.manifest = config.train_manifest
        train_dataset = ManifestDataset.from_config(config)
-        config.data.manifest = config.data.dev_manifest
+        config.manifest = config.dev_manifest
        dev_dataset = ManifestDataset.from_config(config)
-        config.data.manifest = config.data.test_manifest
+        config.manifest = config.test_manifest
        test_dataset = ManifestDataset.from_config(config)
        if self.parallel:
            batch_sampler = SortagradDistributedBatchSampler(
                train_dataset,
-                batch_size=config.collator.batch_size,
+                batch_size=config.batch_size,
                num_replicas=None,
                rank=None,
                shuffle=True,
                drop_last=True,
-                sortagrad=config.collator.sortagrad,
+                sortagrad=config.sortagrad,
-                shuffle_method=config.collator.shuffle_method)
+                shuffle_method=config.shuffle_method)
        else:
            batch_sampler = SortagradBatchSampler(
                train_dataset,
                shuffle=True,
-                batch_size=config.collator.batch_size,
+                batch_size=config.batch_size,
                drop_last=True,
-                sortagrad=config.collator.sortagrad,
+                sortagrad=config.sortagrad,
-                shuffle_method=config.collator.shuffle_method)
+                shuffle_method=config.shuffle_method)
        collate_fn_train = SpeechCollator.from_config(config)
-        config.collator.augmentation_config = ""
+        config.augmentation_config = ""
        collate_fn_dev = SpeechCollator.from_config(config)
-        config.collator.keep_transcription_text = True
+        config.keep_transcription_text = True
-        config.collator.augmentation_config = ""
+        config.augmentation_config = ""
        collate_fn_test = SpeechCollator.from_config(config)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
            collate_fn=collate_fn_train,
-            num_workers=config.collator.num_workers)
+            num_workers=config.num_workers)
        self.valid_loader = DataLoader(
            dev_dataset,
-            batch_size=config.collator.batch_size,
+            batch_size=config.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=collate_fn_dev)
        self.test_loader = DataLoader(
            test_dataset,
-            batch_size=config.decoding.batch_size,
+            batch_size=config.decode.decode_batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=collate_fn_test)
@ -250,31 +230,10 @@ class DeepSpeech2Trainer(Trainer):
 class DeepSpeech2Tester(DeepSpeech2Trainer):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        # testing config
        default = CfgNode(
            dict(
                alpha=2.5,  # Coef of LM for beam search.
                beta=0.3,  # Coef of WC for beam search.
                cutoff_prob=1.0,  # Cutoff probability for pruning.
                cutoff_top_n=40,  # Cutoff number for pruning.
                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
                num_proc_bsearch=8,  # # of CPUs for beam search.
                beam_size=500,  # Beam search width.
                batch_size=128,  # decoding batch size
            ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default
    def __init__(self, config, args):
        self._text_featurizer = TextFeaturizer(
-            unit_type=config.collator.unit_type, vocab_filepath=None)
+            unit_type=config.unit_type, vocab=None)
        super().__init__(config, args)
    def ordid2token(self, texts, texts_len):
@ -293,7 +252,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
                        texts,
                        texts_len,
                        fout=None):
-        cfg = self.config.decoding
+        cfg = self.config.decode
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
@ -399,31 +358,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            self.export()
        except KeyboardInterrupt:
            exit(-1)
    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_checkpointer()
        self.setup_dataloader()
        self.setup_model()
        self.iteration = 0
        self.epoch = 0
    def setup_output_dir(self):
        """Create a directory used for output.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
--- a/examples/ted_en_zh/st0/conf/preprocess.yaml
+++ b/examples/ted_en_zh/st0/conf/preprocess.yaml
@ -0,0 +1,25 @@
 process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: 0.1
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
--- a/examples/ted_en_zh/st0/conf/transformer.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer.yaml
@ -1,50 +1,55 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.05  # second
+dev_manifest: data/manifest.dev
-  max_input_len: 30.0 # second
+test_manifest: data/manifest.test
-  min_output_len: 0.0 # tokens
+min_input_len: 0.05  # second
-  max_output_len: 400.0 # tokens
+max_input_len: 30.0 # second
-  min_output_input_ratio: 0.01
+min_output_len: 0.0 # tokens
-  max_output_input_ratio: 20.0
+max_output_len: 400.0 # tokens
 min_output_input_ratio: 0.01
 max_output_input_ratio: 20.0
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: data/lang_char/bpe_unigram_8000
+vocab_filepath: data/lang_char/vocab.txt
-  mean_std_filepath: ""
+unit_type: 'spm'
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: data/lang_char/bpe_unigram_8000
-  batch_size: 16
+mean_std_filepath: ""
-  maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced
+preprocess_config: conf/preprocess.yaml
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
+maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced
-  spectrum_type: fbank #linear, mfcc, fbank
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  feat_dim: 80
+raw_wav: True  # use raw_wav or kaldi feature
-  delta_delta: False
+spectrum_type: fbank #linear, mfcc, fbank
-  dither: 1.0
+feat_dim: 80
-  target_sample_rate: 16000
+delta_delta: False
-  max_freq: None
+dither: 1.0
-  n_fft: None
+target_sample_rate: 16000
-  stride_ms: 10.0
+max_freq: None
-  window_ms: 25.0
+n_fft: None
-  use_dB_normalization: True
+stride_ms: 10.0
-  target_dB: -20
+window_ms: 25.0
-  random_seed: 0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True 
+random_seed: 0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True 
 shuffle_method: batch_shuffle
 num_workers: 2
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: "data/mean_std.json"
+############################################
-    cmvn_file_type: "json"
+cmvn_file: "data/mean_std.json"
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -55,9 +60,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -66,49 +71,28 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-
+###########################################
-training:
+#                Training                 #
-  n_epoch: 120
+###########################################
-  accum_grad: 2
+n_epoch: 120
-  global_grad_clip: 5.0
+accum_grad: 2
-  optim: adam
+global_grad_clip: 5.0
-  optim_conf:
+optim: adam
 optim_conf:
  lr: 2.5
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: noam    
+scheduler: noam    
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 50
+log_interval: 50
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 5
  error_rate_type: char-bleu
  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  word_reward: 0.7
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
@ -1,50 +1,55 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.05  # second
+dev_manifest: data/manifest.dev
-  max_input_len: 30.0 # second
+test_manifest: data/manifest.test
-  min_output_len: 0.0 # tokens
+min_input_len: 0.05  # second
-  max_output_len: 400.0 # tokens
+max_input_len: 30.0 # second
-  min_output_input_ratio: 0.01
+min_output_len: 0.0 # tokens
-  max_output_input_ratio: 20.0
+max_output_len: 400.0 # tokens
 min_output_input_ratio: 0.01
 max_output_input_ratio: 20.0
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: data/lang_char/bpe_unigram_8000
+vocab_filepath: data/lang_char/vocab.txt
-  mean_std_filepath: ""
+unit_type: 'spm'
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: data/lang_char/bpe_unigram_8000
-  batch_size: 16
+mean_std_filepath: ""
-  maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced
+preprocess_config: conf/preprocess.yaml
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+batch_size: 16
-  raw_wav: True  # use raw_wav or kaldi feature
+maxlen_in: 5  # if input length  > maxlen-in, batchsize is automatically reduced
-  spectrum_type: fbank #linear, mfcc, fbank
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  feat_dim: 80
+raw_wav: True  # use raw_wav or kaldi feature
-  delta_delta: False
+spectrum_type: fbank #linear, mfcc, fbank
-  dither: 1.0
+feat_dim: 80
-  target_sample_rate: 16000
+delta_delta: False
-  max_freq: None
+dither: 1.0
-  n_fft: None
+target_sample_rate: 16000
-  stride_ms: 10.0
+max_freq: None
-  window_ms: 25.0
+n_fft: None
-  use_dB_normalization: True
+stride_ms: 10.0
-  target_dB: -20
+window_ms: 25.0
-  random_seed: 0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True 
+random_seed: 0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True 
 shuffle_method: batch_shuffle
 num_workers: 2
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: "data/mean_std.json"
+############################################
-    cmvn_file_type: "json"
+cmvn_file: "data/mean_std.json"
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -55,9 +60,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -66,49 +71,32 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    asr_weight: 0.5
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 120
+#                Training                 #
-  accum_grad: 2
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 120
-  optim: adam
+accum_grad: 2
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 2.5
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: noam    
+scheduler: noam    
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 50
+log_interval: 50
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 5
  error_rate_type: char-bleu
  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  word_reward: 0.7
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/st0/conf/tuning/decode.yaml
+++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml
@ -0,0 +1,11 @@
 batch_size: 5
 error_rate_type: char-bleu
 decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
 beam_size: 10
 word_reward: 0.7
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/st0/local/test.sh
+++ b/examples/ted_en_zh/st0/local/test.sh
@ -1,7 +1,7 @@
 #! /usr/bin/env bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,16 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 for type in fullsentence; do
    echo "decoding ${type}"
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/ted_en_zh/st0/run.sh
+++ b/examples/ted_en_zh/st0/run.sh
@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer_mtl_noam.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -32,7 +33,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
--- a/examples/ted_en_zh/st1/conf/preprocess.yaml
+++ b/examples/ted_en_zh/st1/conf/preprocess.yaml
@ -0,0 +1,16 @@
 process:
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
--- a/examples/ted_en_zh/st1/conf/transformer.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer.yaml
@ -1,42 +1,46 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test
-collator:
+###########################################
-  vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
+vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
-  mean_std_filepath: ""
+unit_type: 'spm'
-  # augmentation_config: conf/augmentation.json
+spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
-  batch_size: 20
+mean_std_filepath: ""
-  feat_dim: 83
+# preprocess_config: conf/augmentation.json
-  stride_ms: 10.0
+batch_size: 20
-  window_ms: 25.0
+feat_dim: 83
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+window_ms: 25.0
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config:
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0
-  subsampling_factor: 1
+preprocess_config:
-  num_encs: 1
+num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-
+############################################
-# network architecture
+#           Network Architecture           #
-model:
+############################################
-    cmvn_file: None
+cmvn_file: None
-    cmvn_file_type: "json"
+cmvn_file_type: "json"
-    # encoder related
+# encoder related
-    encoder: transformer
+encoder: transformer
-    encoder_conf:
+encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -47,9 +51,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -58,47 +62,29 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    asr_weight: 0.0
    ctc_weight: 0.0
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 40
+#                Training                 #
-  accum_grad: 2
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 40
-  optim: adam
+accum_grad: 2
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 2.5
  weight_decay: 0.
-  scheduler: noam    
+scheduler: noam    
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 50
+log_interval: 50
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 5
  error_rate_type: char-bleu
  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  word_reward: 0.7
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
+++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@ -1,42 +1,46 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test
-collator:
+###########################################
-  vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
+#              Dataloader                 #
-  unit_type: 'spm'
+###########################################
-  spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
+vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
-  mean_std_filepath: ""
+unit_type: 'spm'
-  # augmentation_config: conf/augmentation.json
+spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
-  batch_size: 20
+mean_std_filepath: ""
-  feat_dim: 83
+# preprocess_config: conf/augmentation.json
-  stride_ms: 10.0
+batch_size: 20
-  window_ms: 25.0
+feat_dim: 83
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+stride_ms: 10.0
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+window_ms: 25.0
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  minibatches: 0 # for debug
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  batch_count: auto
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  batch_bins: 0 
+minibatches: 0 # for debug
-  batch_frames_in: 0
+batch_count: auto
-  batch_frames_out: 0
+batch_bins: 0 
-  batch_frames_inout: 0
+batch_frames_in: 0
-  augmentation_config:
+batch_frames_out: 0
-  num_workers: 0
+batch_frames_inout: 0
-  subsampling_factor: 1
+preprocess_config:
-  num_encs: 1
+num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-
+############################################
-# network architecture
+#           Network Architecture           #
-model:
+############################################
-    cmvn_file: None
+cmvn_file: None
-    cmvn_file_type: "json"
+cmvn_file_type: "json"
-    # encoder related
+# encoder related
-    encoder: transformer
+encoder: transformer
-    encoder_conf:
+encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -47,9 +51,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -58,47 +62,29 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    asr_weight: 0.5
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 40
+#                Training                 #
-  accum_grad: 2
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 40
-  optim: adam
+accum_grad: 2
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 2.5
  weight_decay: 0.
-  scheduler: noam    
+scheduler: noam    
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 50
+log_interval: 50
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 5
  error_rate_type: char-bleu
  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  word_reward: 0.7
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/st1/conf/tuning/decode.yaml
+++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml
@ -0,0 +1,12 @@
 batch_size: 5
 error_rate_type: char-bleu
 decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
 beam_size: 10
 word_reward: 0.7
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/st1/local/test.sh
+++ b/examples/ted_en_zh/st1/local/test.sh
@ -1,7 +1,7 @@
 #! /usr/bin/env bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,16 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 for type in fullsentence; do
    echo "decoding ${type}"
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/ted_en_zh/st1/run.sh
+++ b/examples/ted_en_zh/st1/run.sh
@ -7,6 +7,7 @@ gpus=0,1,2,3
 stage=1
 stop_stage=4
 conf_path=conf/transformer_mtl_noam.yaml
 decode_conf_path=conf/tuning/decode.yaml
 ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
@ -38,5 +39,5 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
--- a/examples/timit/asr1/conf/transformer.yaml
+++ b/examples/timit/asr1/conf/transformer.yaml
@ -1,47 +1,45 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.0  # second
+dev_manifest: data/manifest.dev
-  max_input_len: 10.0 # second
+test_manifest: data/manifest.test
  min_output_len: 0.0 # tokens
  max_output_len: 150.0 # tokens
  min_output_input_ratio: 0.005
  max_output_input_ratio: 1000.0
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt
+#              Dataloader                 #
-  unit_type: "word"
+###########################################
-  mean_std_filepath: ""
+vocab_filepath: data/lang_char/vocab.txt
-  augmentation_config: conf/preprocess.yaml
+spm_model_prefix: ''
-  batch_size: 64
+unit_type: "word"
-  raw_wav: True  # use raw_wav or kaldi feature
+mean_std_filepath: ""
-  spectrum_type: fbank #linear, mfcc, fbank
+preprocess_config: conf/preprocess.yaml
-  feat_dim: 80
+feat_dim: 80
-  delta_delta: False
+stride_ms: 10.0
-  dither: 1.0
+window_ms: 25.0
-  target_sample_rate: 16000
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  max_freq: None
+batch_size: 64
-  n_fft: None
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  stride_ms: 10.0
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  window_ms: 25.0
+minibatches: 0 # for debug
-  use_dB_normalization: True
+batch_count: auto
-  target_dB: -20
+batch_bins: 0 
-  random_seed: 0
+batch_frames_in: 0
-  keep_transcription_text: False
+batch_frames_out: 0
-  sortagrad: True 
+batch_frames_inout: 0
-  shuffle_method: batch_shuffle
+num_workers: 0
-  num_workers: 2
+subsampling_factor: 1
 num_encs: 1
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
 encoder_conf:
    output_size: 128    # dimension of attention
    attention_heads: 4
    linear_units: 1024  # the number of units of position-wise feed forward
@ -52,9 +50,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 1024
    num_blocks: 6
@ -63,48 +61,29 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.5
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 50
+#                Training                 #
-  accum_grad: 1
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 50
-  optim: adam
+accum_grad: 1
-  optim_conf:
+global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.004
-    weight_decay: 1e-06
+  weight_decay: 1.0e-6
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 1200
  lr_decay: 1.0
-  log_interval: 10
+log_interval: 10
-  checkpoint:
+checkpoint:
  kbest_n: 50
  latest_n: 5
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/timit/asr1/conf/tuning/decode.yaml
+++ b/examples/timit/asr1/conf/tuning/decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 64
 error_rate_type: wer
 decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/timit/asr1/local/align.sh
+++ b/examples/timit/asr1/local/align.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 batch_size=1
 output_dir=${ckpt_prefix}
@ -20,9 +21,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
--- a/examples/timit/asr1/local/test.sh
+++ b/examples/timit/asr1/local/test.sh
@ -7,8 +7,8 @@ stop_stage=50
 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
+        --opts decode.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decode_batch_size ${batch_size}
        if [ $? -ne 0 ]; then
            echo "Failed in evaluation!"
@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu}  \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
+        --opts decode.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decode_batch_size ${batch_size}
        if [ $? -ne 0 ]; then
            echo "Failed in evaluation!"
@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu}  \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
+        --opts decode.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decode_batch_size ${batch_size}
        if [ $? -ne 0 ]; then
            echo "Failed in evaluation!"
--- a/examples/timit/asr1/run.sh
+++ b/examples/timit/asr1/run.sh
@ -7,6 +7,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=10
 TIMIT_path=/path/to/TIMIT
@ -34,15 +35,15 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
-#     # export ckpt avg_n
+     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
+fi
--- a/examples/tiny/asr0/conf/deepspeech2.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2.yaml
@ -1,70 +1,67 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.tiny
+#                   Data                  #
-  dev_manifest: data/manifest.tiny
+###########################################
-  test_manifest: data/manifest.tiny 
+train_manifest: data/manifest.tiny
-  min_input_len: 0.0
+dev_manifest: data/manifest.tiny
-  max_input_len: 30.0
+test_manifest: data/manifest.tiny 
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: 400.0
+max_input_len: 30.0
-  min_output_input_ratio: 0.05
+min_output_len: 0.0
-  max_output_input_ratio: 10.0
+max_output_len: 400.0
 min_output_input_ratio: 0.05
 max_output_input_ratio: 10.0
-collator:
+###########################################
-  mean_std_filepath: data/mean_std.json
+#              Dataloader                 #
-  unit_type: char
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt
+mean_std_filepath: data/mean_std.json
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/lang_char/vocab.txt
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True 
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 2
+sortagrad: True 
-  batch_size: 4
+shuffle_method: batch_shuffle
 num_workers: 2
 batch_size: 4
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 3
+############################################
-  rnn_layer_size: 2048
+num_conv_layers: 2
-  use_gru: False 
+num_rnn_layers: 3
-  share_rnn_weights: True 
+rnn_layer_size: 2048
-  blank_id: 0
+use_gru: False 
 share_rnn_weights: True 
 blank_id: 0
-training:
+###########################################
-  n_epoch: 5
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 1e-5 
+n_epoch: 5
-  lr_decay: 0.8 
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 1e-5 
-  global_grad_clip: 5.0
+lr_decay: 0.8 
-  log_interval: 1
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 5.0
 log_interval: 1
 checkpoint:
  kbest_n: 3
  latest_n: 2
 decoding:
  batch_size: 128
  error_rate_type: wer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/tiny/asr0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml
@ -1,72 +1,68 @@
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.tiny
+#                   Data                  #
-  dev_manifest: data/manifest.tiny
+###########################################
-  test_manifest: data/manifest.tiny 
+train_manifest: data/manifest.tiny
-  min_input_len: 0.0
+dev_manifest: data/manifest.tiny
-  max_input_len: 30.0
+test_manifest: data/manifest.tiny 
-  min_output_len: 0.0
+min_input_len: 0.0
-  max_output_len: 400.0
+max_input_len: 30.0
-  min_output_input_ratio: 0.05
+min_output_len: 0.0
-  max_output_input_ratio: 10.0
+max_output_len: 400.0
 min_output_input_ratio: 0.05
 max_output_input_ratio: 10.0
-collator:
+###########################################
-  mean_std_filepath: data/mean_std.json
+#              Dataloader                 #
-  unit_type: char
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt
+mean_std_filepath: data/mean_std.json
-  augmentation_config: conf/augmentation.json
+unit_type: char
-  random_seed: 0
+vocab_filepath: data/lang_char/vocab.txt
-  spm_model_prefix: 
+augmentation_config: conf/augmentation.json
-  spectrum_type: linear
+random_seed: 0
-  feat_dim: 
+spm_model_prefix: 
-  delta_delta: False
+spectrum_type: linear
-  stride_ms: 10.0
+feat_dim: 
-  window_ms: 20.0
+delta_delta: False
-  n_fft: None
+stride_ms: 10.0
-  max_freq: None
+window_ms: 20.0
-  target_sample_rate: 16000
+n_fft: None
-  use_dB_normalization: True
+max_freq: None
-  target_dB: -20
+target_sample_rate: 16000
-  dither: 1.0
+use_dB_normalization: True
-  keep_transcription_text: False
+target_dB: -20
-  sortagrad: True 
+dither: 1.0
-  shuffle_method: batch_shuffle
+keep_transcription_text: False
-  num_workers: 0
+sortagrad: True 
-  batch_size: 4
+shuffle_method: batch_shuffle
 num_workers: 0
 batch_size: 4
-model:
+############################################
-  num_conv_layers: 2
+#           Network Architecture           #
-  num_rnn_layers: 4
+############################################
-  rnn_layer_size: 2048
+num_conv_layers: 2
-  rnn_direction: forward
+num_rnn_layers: 4
-  num_fc_layers: 2
+rnn_layer_size: 2048
-  fc_layers_size_list: 512, 256
+rnn_direction: forward
-  use_gru: True 
+num_fc_layers: 2
-  blank_id: 0
+fc_layers_size_list: 512, 256
 use_gru: True 
 blank_id: 0
-training:
+###########################################
-  n_epoch: 5
+#                Training                 #
-  accum_grad: 1
+###########################################
-  lr: 1e-5 
+n_epoch: 5
-  lr_decay: 1.0 
+accum_grad: 1
-  weight_decay: 1e-06
+lr: 1e-5 
-  global_grad_clip: 5.0
+lr_decay: 1.0 
-  log_interval: 1
+weight_decay: 1e-06
-  checkpoint:
+global_grad_clip: 5.0
 log_interval: 1
 checkpoint:
  kbest_n: 3
  latest_n: 2
 decoding:
  batch_size: 128
  error_rate_type: wer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
--- a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml
+++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 128
 error_rate_type: wer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
 alpha: 2.5
 beta: 0.3
 beam_size: 500
 cutoff_prob: 1.0
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/tiny/asr0/conf/tuning/decode.yaml
+++ b/examples/tiny/asr0/conf/tuning/decode.yaml
@ -0,0 +1,10 @@
 decode_batch_size: 128
 error_rate_type: wer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
 alpha: 2.5
 beta: 0.3
 beam_size: 500
 cutoff_prob: 1.0
 cutoff_top_n: 40
 num_proc_bsearch: 8
--- a/examples/tiny/asr0/local/test.sh
+++ b/examples/tiny/asr0/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-model_type=$3
+ckpt_prefix=$3
 model_type=$4
 # download language model
 bash local/download_lm_en.sh
@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
--- a/examples/tiny/asr0/run.sh
+++ b/examples/tiny/asr0/run.sh
@ -6,6 +6,7 @@ gpus=0
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline
@ -32,7 +33,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--- a/examples/tiny/asr1/conf/chunk_confermer.yaml
+++ b/examples/tiny/asr1/conf/chunk_confermer.yaml
@ -1,48 +1,11 @@
-# https://yaml.org/type/float.html
+############################################
-data:
+#           Network Architecture           #
-  train_manifest: data/manifest.tiny
+############################################
-  dev_manifest: data/manifest.tiny
+cmvn_file: "data/mean_std.json"
-  test_manifest: data/manifest.tiny
+cmvn_file_type: "json"
-  min_input_len: 0.5  # second
+# encoder related
-  max_input_len: 30.0 # second
+encoder: conformer
-  min_output_len: 0.0 # tokens
+encoder_conf:
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  mean_std_filepath: ""
  vocab_filepath: data/lang_char/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
  augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: "data/mean_std.json"
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
    encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -62,9 +25,9 @@ model:
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -73,48 +36,62 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 5
+#                   Data                  #
-  accum_grad: 1
+###########################################
-  global_grad_clip: 5.0
+train_manifest: data/manifest.tiny
-  optim: adam
+dev_manifest: data/manifest.tiny
-  optim_conf:
+test_manifest: data/manifest.tiny
 ###########################################
 #              Dataloader                 #
 ###########################################
 mean_std_filepath: ""
 vocab_filepath: data/lang_char/vocab.txt 
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 4
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 preprocess_config: conf/preprocess.yaml 
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
 ###########################################
 #                 Training                #
 ###########################################
 n_epoch: 5
 accum_grad: 1
 global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.001
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 1
+log_interval: 1
-  checkpoint:
+checkpoint:
  kbest_n: 10
  latest_n: 1
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/tiny/asr1/conf/chunk_transformer.yaml
+++ b/examples/tiny/asr1/conf/chunk_transformer.yaml
@ -1,48 +1,11 @@
-# https://yaml.org/type/float.html
+############################################
-data:
+#           Network Architecture           #
-  train_manifest: data/manifest.tiny
+############################################
-  dev_manifest: data/manifest.tiny
+cmvn_file: "data/mean_std.json"
-  test_manifest: data/manifest.tiny
+cmvn_file_type: "json"
-  min_input_len: 0.5  # second
+# encoder related
-  max_input_len: 20.0 # second
+encoder: transformer
-  min_output_len: 0.0 # tokens
+encoder_conf:
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  mean_std_filepath: ""
  vocab_filepath: data/lang_char/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
  augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: "data/mean_std.json"
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
    encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -55,9 +18,9 @@ model:
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -66,48 +29,63 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+# https://yaml.org/type/float.html
-  n_epoch: 5
+###########################################
-  accum_grad: 1
+#                   Data                  #
-  global_grad_clip: 5.0
+###########################################
-  optim: adam
+train_manifest: data/manifest.tiny
-  optim_conf:
+dev_manifest: data/manifest.tiny
 test_manifest: data/manifest.tiny
 ###########################################
 #              Dataloader                 #
 ###########################################
 mean_std_filepath: ""
 vocab_filepath: data/lang_char/vocab.txt 
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
 preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 4
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
 ###########################################
 #                 Training                #
 ###########################################
 n_epoch: 5
 accum_grad: 1
 global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.002
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 1
+log_interval: 1
-  checkpoint:
+checkpoint:
  kbest_n: 10
  latest_n: 1
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
@ -1,48 +1,12 @@
 # https://yaml.org/type/float.html
-data:
+############################################
-  train_manifest: data/manifest.tiny
+#           Network Architecture           #
-  dev_manifest: data/manifest.tiny
+############################################
-  test_manifest: data/manifest.tiny
+cmvn_file: "data/mean_std.json"
-  min_input_len: 0.5  # second
+cmvn_file_type: "json"
-  max_input_len: 20.0 # second
+# encoder related
-  min_output_len: 0.0 # tokens
+encoder: conformer
-  max_output_len: 400.0 # tokens
+encoder_conf:
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  mean_std_filepath: ""
  vocab_filepath: data/lang_char/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
  augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: "data/mean_std.json"
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
    encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -58,9 +22,9 @@ model:
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -69,48 +33,65 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-training:
+###########################################
-  n_epoch: 5
+#                   Data                  #
-  accum_grad: 4
+###########################################
-  global_grad_clip: 5.0
+train_manifest: data/manifest.tiny
-  optim: adam
+dev_manifest: data/manifest.tiny
-  optim_conf:
+test_manifest: data/manifest.tiny
 ###########################################
 #              Dataloader                 #
 ###########################################
 mean_std_filepath: ""
 vocab_filepath: data/lang_char/vocab.txt 
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
 preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 4
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
 ###########################################
 #                 Training                #
 ###########################################
 n_epoch: 5
 accum_grad: 4
 global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.002
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 1
+log_interval: 1
-  checkpoint:
+checkpoint:
  kbest_n: 10
  latest_n: 1
 decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
@ -1,47 +1,12 @@
 # https://yaml.org/type/float.html
-data:
+############################################
-  train_manifest: data/manifest.tiny
+#           Network Architecture           #
-  dev_manifest: data/manifest.tiny
+############################################
-  test_manifest: data/manifest.tiny
+cmvn_file: 
-  min_input_len: 0.5  # second
+cmvn_file_type: "json"
-  max_input_len: 20.0 # second
+# encoder related
-  min_output_len: 0.0 # tokens
+encoder: transformer
-  max_output_len: 400.0 # tokens
+encoder_conf:
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  mean_std_filepath: data/mean_std.json
  vocab_filepath: data/lang_char/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
  augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
    encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
@ -52,9 +17,9 @@ model:
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
@ -63,48 +28,63 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
 ###########################################
 #                   Data                  #
 ###########################################
 train_manifest: data/manifest.tiny
 dev_manifest: data/manifest.tiny
 test_manifest: data/manifest.tiny
 ###########################################
 #              Dataloader                 #
 ###########################################
 mean_std_filepath: data/mean_std.json
 vocab_filepath: data/lang_char/vocab.txt 
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_200'
 preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 4
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-training:
+
-  n_epoch: 5
+###########################################
-  accum_grad: 1
+#                 Training                #
-  global_grad_clip: 5.0
+###########################################
-  optim: adam
+n_epoch: 5
-  optim_conf:
+accum_grad: 1
 global_grad_clip: 5.0
 optim: adam
 optim_conf:
  lr: 0.002
-    weight_decay: 1e-06
+  weight_decay: 1.0e-06
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
-  log_interval: 1
+log_interval: 1
-  checkpoint:
+checkpoint:
  kbest_n: 2
  latest_n: 1
 decoding:
  batch_size: 8 #64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 8 #64
 error_rate_type: wer
 decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/tiny/asr1/conf/tuning/decode.yaml
+++ b/examples/tiny/asr1/conf/tuning/decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 8 #64
 error_rate_type: wer
 decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/tiny/asr1/local/align.sh
+++ b/examples/tiny/asr1/local/align.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 batch_size=1
 output_dir=${ckpt_prefix}
@ -20,9 +21,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
--- a/examples/tiny/asr1/local/test.sh
+++ b/examples/tiny/asr1/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/tiny/asr1/run.sh
+++ b/examples/tiny/asr1/run.sh
@ -6,6 +6,7 @@ gpus=0
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -31,12 +32,12 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
--- a/examples/wenetspeech/asr1/conf/conformer.yaml
+++ b/examples/wenetspeech/asr1/conf/conformer.yaml
@ -1,8 +1,11 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    # encoder related
+############################################
-    encoder: conformer
+cmvn_file: 
-    encoder_conf:
+cmvn_file_type: "json"
 # encoder related
 encoder: conformer
 encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
@ -19,9 +22,9 @@ model:
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
@ -30,82 +33,60 @@ model:
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+# hybrid CTC/attention
-    model_conf:
+model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
 # https://yaml.org/type/float.html
-data:
+###########################################
-  train_manifest: data/manifest.train
+#                   Data                  #
-  dev_manifest: data/manifest.dev
+###########################################
-  test_manifest: data/manifest.test
+train_manifest: data/manifest.train
-  min_input_len: 0.1 # second
+dev_manifest: data/manifest.dev
-  max_input_len: 12.0 # second
+test_manifest: data/manifest.test
  min_output_len: 1.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#              Dataloader                 #
-  unit_type: 'char'
+###########################################
-  spm_model_prefix: ''
+vocab_filepath: data/lang_char/vocab.txt 
-  augmentation_config: conf/preprocess.yaml
+unit_type: 'char'
-  batch_size: 64
+preprocess_config: conf/preprocess.yaml
-  raw_wav: True  # use raw_wav or kaldi feature
+spm_model_prefix: ''
-  spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
-  feat_dim: 80
+stride_ms: 10.0
-  delta_delta: False
+window_ms: 25.0
-  dither: 1.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  target_sample_rate: 16000
+batch_size: 64
-  max_freq: None
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  n_fft: None
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  stride_ms: 10.0
+minibatches: 0 # for debug
-  window_ms: 25.0
+batch_count: auto
-  use_dB_normalization: True 
+batch_bins: 0 
-  target_dB: -20
+batch_frames_in: 0
-  random_seed: 0
+batch_frames_out: 0
-  keep_transcription_text: False
+batch_frames_inout: 0
-  sortagrad: True 
+num_workers: 0
-  shuffle_method: batch_shuffle
+subsampling_factor: 1
-  num_workers: 2
+num_encs: 1
-training:
+###########################################
-  n_epoch: 240 
+#                 Training                #
-  accum_grad: 16
+###########################################
-  global_grad_clip: 5.0
+n_epoch: 240 
-  log_interval: 100
+accum_grad: 16
-  checkpoint:
+global_grad_clip: 5.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
-  optim: adam
+optim: adam
-  optim_conf:
+optim_conf:
  lr: 0.001
-    weight_decay: 1e-6
+  weight_decay: 1.0e-6
-  scheduler: warmuplr     
+scheduler: warmuplr     
-  scheduler_conf:
+scheduler_conf:
  warmup_steps: 5000
  lr_decay: 1.0
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/wenetspeech/asr1/conf/tuning/decode.yaml
+++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml
@ -0,0 +1,11 @@
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/wenetspeech/asr1/local/test.sh
+++ b/examples/wenetspeech/asr1/local/test.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 2 ];then
+if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
 ckpt_prefix=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decode_batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
--- a/examples/wenetspeech/asr1/local/test_wav.sh
+++ b/examples/wenetspeech/asr1/local/test_wav.sh
@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
+if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
 fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
-audio_file=$3
+ckpt_prefix=$3
 audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@ -43,10 +44,11 @@ for type in  attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+    --opts decode.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size} \
+    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}
    if [ $? -ne 0 ]; then
--- a/examples/wenetspeech/asr1/run.sh
+++ b/examples/wenetspeech/asr1/run.sh
@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=100
 conf_path=conf/conformer.yaml
-
+decode_conf_path=conf/tuning/decode.yaml
 average_checkpoint=true
 avg_num=10
@ -36,12 +36,12 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@ -51,5 +51,5 @@ fi
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
--- a/paddlespeech/s2t/decoders/recog.py
+++ b/paddlespeech/s2t/decoders/recog.py
@ -85,7 +85,7 @@ def recog_v2(args):
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
-        preprocess_conf=confs.collator.augmentation_config
+        preprocess_conf=confs.preprocess_config
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={"train": False}, )
--- a/Show More
+++ b/Show More