@@ -45,33 +45,6 @@ logger = Log(__name__).getlog()
 class U2STTrainer(Trainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # training config
-        default = CfgNode(
-            dict(
-                n_epoch=50,  # train epochs
-                log_interval=100,  # steps
-                accum_grad=1,  # accum grad by # steps
-                global_grad_clip=5.0,  # the global norm clip
-            ))
-        default.optim = 'adam'
-        default.optim_conf = CfgNode(
-            dict(
-                lr=5e-4,  # learning rate
-                weight_decay=1e-6,  # the coeff of weight decay
-            ))
-        default.scheduler = 'warmuplr'
-        default.scheduler_conf = CfgNode(
-            dict(
-                warmup_steps=25000,
-                lr_decay=1.0,  # learning rate decay
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
@@ -127,7 +100,7 @@ class U2STTrainer(Trainer):
         for k, v in losses_np.items():
             report(k, v)
-        report("batch_size", self.config.collator.batch_size)
+        report("batch_size", self.config.batch_size)
         report("accum", train_conf.accum_grad)
         report("step_cost", iteration_time)
@@ -236,7 +209,7 @@ class U2STTrainer(Trainer):
                                 msg += ","
                             msg = msg[:-1]  # remove the last ","
                             if (batch_index + 1
-                                ) % self.config.training.log_interval == 0:
+                                ) % self.config.log_interval == 0:
                                 logger.info(msg)
                 except Exception as e:
                     logger.error(e)
@@ -287,7 +260,7 @@ class U2STTrainer(Trainer):
                 batch_frames_in=0,
                 batch_frames_out=0,
                 batch_frames_inout=0,
-                preprocess_conf=config.augmentation_config,  # aug will be off when train_mode=False
+                preprocess_conf=config.preprocess_config,  # aug will be off when train_mode=False
                 n_iter_processes=config.num_workers,
                 subsampling_factor=1,
                 load_aux_output=load_transcript,
@@ -308,7 +281,7 @@ class U2STTrainer(Trainer):
                 batch_frames_in=0,
                 batch_frames_out=0,
                 batch_frames_inout=0,
-                preprocess_conf=config.augmentation_config,  # aug will be off when train_mode=False
+                preprocess_conf=config.preprocess_config,  # aug will be off when train_mode=False
                 n_iter_processes=config.num_workers,
                 subsampling_factor=1,
                 load_aux_output=load_transcript,
@@ -319,7 +292,7 @@ class U2STTrainer(Trainer):
             # test dataset, return raw text
             decode_batch_size = config.get('decode', dict()).get('decode_batch_size', 1)
             self.test_loader = BatchDataLoader(
-                json_file=config.data.test_manifest,
+                json_file=config.test_manifest,
                 train_mode=False,
                 sortagrad=False,
                 batch_size=decode_batch_size,
@@ -332,7 +305,7 @@ class U2STTrainer(Trainer):
                 batch_frames_in=0,
                 batch_frames_out=0,
                 batch_frames_inout=0,
-                preprocess_conf=config.augmentation_config,  # aug will be off when train_mode=False
+                preprocess_conf=config.preprocess_config,  # aug will be off when train_mode=False
                 n_iter_processes=config.num_workers,
                 subsampling_factor=1,
                 num_encs=1,
@@ -379,7 +352,7 @@ class U2STTrainer(Trainer):
                 config,
                 parameters,
                 lr_scheduler=None, ):
-            train_config = config.training
+            train_config = config
             optim_type = train_config.optim
             optim_conf = train_config.optim_conf
             scheduler_type = train_config.scheduler
@@ -405,41 +378,12 @@ class U2STTrainer(Trainer):
 class U2STTester(U2STTrainer):
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        # decoding config
-        default = CfgNode(
-            dict(
-                alpha=2.5,  # Coef of LM for beam search.
-                beta=0.3,  # Coef of WC for beam search.
-                cutoff_prob=1.0,  # Cutoff probability for pruning.
-                cutoff_top_n=40,  # Cutoff number for pruning.
-                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-                decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
-                # 'ctc_prefix_beam_search', 'attention_rescoring'
-                error_rate_type='bleu',  # Error rate type for evaluation. Options `bleu`, 'char_bleu'
-                num_proc_bsearch=8,  # # of CPUs for beam search.
-                beam_size=10,  # Beam search width.
-                batch_size=16,  # decoding batch size
-                ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
-                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
-                # <0: for decoding, use full chunk.
-                # >0: for decoding, use fixed chunk size as set.
-                # 0: used for training, it's prohibited here.
-                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
-                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
-            ))
-
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
     def __init__(self, config, args):
         super().__init__(config, args)
         self.text_feature = TextFeaturizer(
-            unit_type=self.config.collator.unit_type,
-            vocab_filepath=self.config.collator.vocab_filepath,
-            spm_model_prefix=self.config.collator.spm_model_prefix)
+            unit_type=self.config.unit_type,
+            vocab=self.config.vocab_filepath,
+            spm_model_prefix=self.config.spm_model_prefix)
         self.vocab_list = self.text_feature.vocab_list

     def id2token(self, texts, texts_len, text_feature):
@@ -526,7 +470,7 @@ class U2STTester(U2STTrainer):
         decode_cfg = self.config.decode
         bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu
-        stride_ms = self.config.collator.stride_ms
+        stride_ms = self.config.stride_ms
         hyps, refs = [], []
         len_refs, num_ins = 0, 0
         num_frames = 0.0
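
Note on the config layout these changes assume: keys that previously lived under the `training`, `collator`, and `data` sub-nodes (and the old `augmentation_config` entry, now `preprocess_config`) are read from the top level of the config object. The sketch below is illustrative only; it builds a yacs-style CfgNode by hand, with key names mirroring those touched in the diff, while the concrete values and file paths are placeholders rather than the project's defaults.

# Illustrative sketch (not part of the patch): flattened config assumed by the
# new config.<key> accesses above. Values and paths are placeholders.
from yacs.config import CfgNode

config = CfgNode(
    dict(
        # formerly under config.training
        n_epoch=50,
        log_interval=100,
        accum_grad=1,
        global_grad_clip=5.0,
        optim='adam',
        optim_conf=dict(lr=5e-4, weight_decay=1e-6),
        scheduler='warmuplr',
        scheduler_conf=dict(warmup_steps=25000, lr_decay=1.0),
        # formerly under config.collator (placeholder values)
        batch_size=16,
        unit_type='spm',
        vocab_filepath='data/vocab.txt',
        spm_model_prefix='data/bpe_unigram_8000',
        stride_ms=10.0,
        num_workers=2,
        # formerly config.data.test_manifest / config.augmentation_config
        test_manifest='data/manifest.test',
        preprocess_config='conf/preprocess.yaml', ))

# old nested access            -> new flattened access
# config.collator.batch_size   -> config.batch_size
# config.training.log_interval -> config.log_interval
# config.data.test_manifest    -> config.test_manifest
assert config.batch_size == 16 and config.optim_conf.lr == 5e-4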