From 1cd4d4bf83b705378ab30c1e26d672d30cd13cbe Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 5 Aug 2021 10:12:22 +0000 Subject: [PATCH] fix tiny conf and refactor optimizer and scheduler --- deepspeech/exps/u2/model.py | 87 ++++-- deepspeech/exps/u2_st/model.py | 5 +- deepspeech/models/deepspeech2.py | 262 ------------------ deepspeech/training/optimizer.py | 81 ++++++ deepspeech/training/scheduler.py | 47 +++- deepspeech/utils/dynamic_import.py | 50 ++++ examples/librispeech/s1/conf/transformer.yaml | 4 +- examples/tiny/s1/conf/chunk_confermer.yaml | 16 +- examples/tiny/s1/conf/chunk_transformer.yaml | 14 +- examples/tiny/s1/conf/conformer.yaml | 16 +- examples/tiny/s1/conf/transformer.yaml | 27 +- 11 files changed, 280 insertions(+), 329 deletions(-) delete mode 100644 deepspeech/models/deepspeech2.py create mode 100644 deepspeech/training/optimizer.py create mode 100644 deepspeech/utils/dynamic_import.py diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index dd62f537..34145780 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -31,8 +31,8 @@ from deepspeech.io.dataset import ManifestDataset from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2 import U2Model -from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog -from deepspeech.training.scheduler import WarmupLR +from deepspeech.training.optimizer import OptimizerFactory +from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.trainer import Trainer from deepspeech.utils import ctc_utils from deepspeech.utils import error_rate @@ -41,6 +41,8 @@ from deepspeech.utils import mp_tools from deepspeech.utils import text_grid from deepspeech.utils import utility from deepspeech.utils.log import Log +# from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog +# from deepspeech.training.scheduler import WarmupLR logger = Log(__name__).getlog() @@ -312,30 +314,63 @@ class U2Trainer(Trainer): scheduler_type = train_config.scheduler scheduler_conf = train_config.scheduler_conf - grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) - weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) - - if scheduler_type == 'expdecaylr': - lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=optim_conf.lr, - gamma=scheduler_conf.lr_decay, - verbose=False) - elif scheduler_type == 'warmuplr': - lr_scheduler = WarmupLR( - learning_rate=optim_conf.lr, - warmup_steps=scheduler_conf.warmup_steps, - verbose=False) - else: - raise ValueError(f"Not support scheduler: {scheduler_type}") - - if optim_type == 'adam': - optimizer = paddle.optimizer.Adam( - learning_rate=lr_scheduler, - parameters=model.parameters(), - weight_decay=weight_decay, - grad_clip=grad_clip) - else: - raise ValueError(f"Not support optim: {optim_type}") + scheduler_args = { + "learning_rate": + optim_conf.lr, + "verbose": + False, + "warmup_steps": + scheduler_conf.warmup_steps + if "warmup_steps" in scheduler_conf else None, + "gamma": + scheduler_conf.lr_decay if "lr_decay" in scheduler_conf else None, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + # if scheduler_type == 'expdecaylr': + # lr_scheduler = paddle.optimizer.lr.ExponentialDecay( + # learning_rate=optim_conf.lr, + # gamma=scheduler_conf.lr_decay, + # verbose=False) + # elif scheduler_type == 'warmuplr': + # lr_scheduler = WarmupLR( + # 
learning_rate=optim_conf.lr,
+        #         warmup_steps=scheduler_conf.warmup_steps,
+        #         verbose=False)
+        # else:
+        #     raise ValueError(f"Not support scheduler: {scheduler_type}")
+
+        def optimizer_args(
+                config,
+                parameters,
+                lr_scheduler=None, ):
+            train_config = config.training
+            optim_type = train_config.optim
+            optim_conf = train_config.optim_conf
+            scheduler_type = train_config.scheduler
+            scheduler_conf = train_config.scheduler_conf
+            return {
+                "grad_clip": train_config.global_grad_clip,
+                "weight_decay": optim_conf.weight_decay,
+                "learning_rate": lr_scheduler
+                if lr_scheduler else optim_conf.lr,
+                "parameters": parameters,
+            }
+
+        optim_args = optimizer_args(config, model.parameters(), lr_scheduler)
+        optimizer = OptimizerFactory.from_args(optim_type, optim_args)
+
+        # grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
+        # weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
+        # if optim_type == 'adam':
+        #     optimizer = paddle.optimizer.Adam(
+        #         learning_rate=lr_scheduler,
+        #         parameters=model.parameters(),
+        #         weight_decay=weight_decay,
+        #         grad_clip=grad_clip)
+        # else:
+        #     raise ValueError(f"Not support optim: {optim_type}")
 
         self.model = model
         self.optimizer = optimizer
diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index f72e2bbc..5734e15f 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -345,9 +345,6 @@ class U2STTrainer(Trainer):
         scheduler_type = train_config.scheduler
         scheduler_conf = train_config.scheduler_conf
 
-        grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
-        weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
-
         if scheduler_type == 'expdecaylr':
             lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
                 learning_rate=optim_conf.lr,
@@ -367,6 +364,8 @@ class U2STTrainer(Trainer):
         else:
             raise ValueError(f"Not support scheduler: {scheduler_type}")
 
+        grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
+        weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
         if optim_type == 'adam':
             optimizer = paddle.optimizer.Adam(
                 learning_rate=lr_scheduler,
diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py
deleted file mode 100644
index 233986a9..00000000
--- a/deepspeech/models/deepspeech2.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Deepspeech2 ASR Model""" -from typing import Optional - -import paddle -from paddle import nn -from yacs.config import CfgNode - -from deepspeech.modules.conv import ConvStack -from deepspeech.modules.ctc import CTCDecoder -from deepspeech.modules.rnn import RNNStack -from deepspeech.utils import layer_tools -from deepspeech.utils.checkpoint import Checkpoint -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['DeepSpeech2Model'] - - -class CRNNEncoder(nn.Layer): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True): - super().__init__() - self.rnn_size = rnn_size - self.feat_size = feat_size # 161 for linear - self.dict_size = dict_size - - self.conv = ConvStack(feat_size, num_conv_layers) - - i_size = self.conv.output_height # H after conv stack - self.rnn = RNNStack( - i_size=i_size, - h_size=rnn_size, - num_stacks=num_rnn_layers, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - - @property - def output_size(self): - return self.rnn_size * 2 - - def forward(self, audio, audio_len): - """Compute Encoder outputs - - Args: - audio (Tensor): [B, Tmax, D] - text (Tensor): [B, Umax] - audio_len (Tensor): [B] - text_len (Tensor): [B] - Returns: - x (Tensor): encoder outputs, [B, T, D] - x_lens (Tensor): encoder length, [B] - """ - # [B, T, D] -> [B, D, T] - audio = audio.transpose([0, 2, 1]) - # [B, D, T] -> [B, C=1, D, T] - x = audio.unsqueeze(1) - x_lens = audio_len - - # convolution group - x, x_lens = self.conv(x, x_lens) - - # convert data from convolution feature map to sequence of vectors - #B, C, D, T = paddle.shape(x) # not work under jit - x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] - #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit - x = x.reshape([0, 0, -1]) #[B, T, C*D] - - # remove padding part - x, x_lens = self.rnn(x, x_lens) #[B, T, D] - return x, x_lens - - -class DeepSpeech2Model(nn.Layer): - """The DeepSpeech2 network structure. - - :param audio_data: Audio spectrogram data layer. - :type audio_data: Variable - :param text_data: Transcription text data layer. - :type text_data: Variable - :param audio_len: Valid sequence length data layer. - :type audio_len: Variable - :param masks: Masks data layer to reset padding. - :type masks: Variable - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (dimension of RNN cells). - :type rnn_size: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward direction RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: A tuple of an output unnormalized log probability layer ( - before softmax) and a ctc cost layer. - :rtype: tuple of LayerOutput - """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. 
- share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - if config is not None: - config.merge_from_other_cfg(default) - return default - - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True): - super().__init__() - self.encoder = CRNNEncoder( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - assert (self.encoder.output_size == rnn_size * 2) - - self.decoder = CTCDecoder( - odim=dict_size, # is in vocab - enc_n_units=self.encoder.output_size, - blank_id=0, # first token is - dropout_rate=0.0, - reduction=True, # sum - batch_average=True) # sum / batch_size - - def forward(self, audio, audio_len, text, text_len): - """Compute Model loss - - Args: - audio (Tenosr): [B, T, D] - audio_len (Tensor): [B] - text (Tensor): [B, U] - text_len (Tensor): [B] - - Returns: - loss (Tenosr): [1] - """ - eouts, eouts_len = self.encoder(audio, audio_len) - loss = self.decoder(eouts, eouts_len, text, text_len) - return loss - - @paddle.no_grad() - def decode(self, audio, audio_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - # init once - # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - - eouts, eouts_len = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) - - @classmethod - def from_pretrained(cls, dataloader, config, checkpoint_path): - """Build a DeepSpeech2Model model from a pretrained model. - Parameters - ---------- - dataloader: paddle.io.DataLoader - - config: yacs.config.CfgNode - model configs - - checkpoint_path: Path or str - the path of pretrained model checkpoint, without extension name - - Returns - ------- - DeepSpeech2Model - The model built from pretrained result. 
-        """
-        model = cls(feat_size=dataloader.collate_fn.feature_size,
-                    dict_size=dataloader.collate_fn.vocab_size,
-                    num_conv_layers=config.model.num_conv_layers,
-                    num_rnn_layers=config.model.num_rnn_layers,
-                    rnn_size=config.model.rnn_layer_size,
-                    use_gru=config.model.use_gru,
-                    share_rnn_weights=config.model.share_rnn_weights)
-        infos = Checkpoint().load_parameters(
-            model, checkpoint_path=checkpoint_path)
-        logger.info(f"checkpoint info: {infos}")
-        layer_tools.summary(model)
-        return model
-
-
-class DeepSpeech2InferModel(DeepSpeech2Model):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True):
-        super().__init__(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-
-    def forward(self, audio, audio_len):
-        """export model function
-
-        Args:
-            audio (Tensor): [B, T, D]
-            audio_len (Tensor): [B]
-
-        Returns:
-            probs: probs after softmax
-        """
-        eouts, eouts_len = self.encoder(audio, audio_len)
-        probs = self.decoder.softmax(eouts)
-        return probs
diff --git a/deepspeech/training/optimizer.py b/deepspeech/training/optimizer.py
new file mode 100644
index 00000000..adbc97ff
--- /dev/null
+++ b/deepspeech/training/optimizer.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+from typing import Dict
+from typing import Text
+
+from paddle.optimizer import Optimizer
+from paddle.regularizer import L2Decay
+
+from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.log import Log
+
+__all__ = ["OptimizerFactory"]
+
+logger = Log(__name__).getlog()
+
+OPTIMIZER_DICT = {
+    "sgd": "paddle.optimizer:SGD",
+    "momentum": "paddle.optimizer:Momentum",
+    "adadelta": "paddle.optimizer:Adadelta",
+    "adam": "paddle.optimizer:Adam",
+    "adamw": "paddle.optimizer:AdamW",
+}
+
+
+def register_optimizer(cls):
+    """Register optimizer."""
+    alias = cls.__name__.lower()
+    OPTIMIZER_DICT[alias] = cls.__module__ + ":" + cls.__name__
+    return cls
+
+
+def dynamic_import_optimizer(module):
+    """Import Optimizer class dynamically.
+
+    Args:
+        module (str): module_name:class_name or alias in `OPTIMIZER_DICT`
+
+    Returns:
+        type: Optimizer class
+
+    """
+    module_class = dynamic_import(module, OPTIMIZER_DICT)
+    assert issubclass(module_class,
+                      Optimizer), f"{module} does not implement Optimizer"
+    return module_class
+
+
+class OptimizerFactory():
+    @classmethod
+    def from_args(cls, name: str, args: Dict[Text, Any]):
+        assert "parameters" in args, "parameters not in args."
+        assert "learning_rate" in args, "learning_rate not in args."
+
+        grad_clip = ClipGradByGlobalNormWithLog(
+            args['grad_clip']) if "grad_clip" in args else None
+        weight_decay = L2Decay(
+            args['weight_decay']) if "weight_decay" in args else None
+        module_class = dynamic_import_optimizer(name.lower())
+
+        if weight_decay:
+            logger.info(f'WeightDecay: {weight_decay}')
+        if grad_clip:
+            logger.info(f'GradClip: {grad_clip}')
+        logger.info(
+            f"Optimizer: {module_class.__name__} {args['learning_rate']}")
+
+        args.update({"grad_clip": grad_clip, "weight_decay": weight_decay})
+        return module_class(**args)
diff --git a/deepspeech/training/scheduler.py b/deepspeech/training/scheduler.py
index d3613028..b8f3ece7 100644
--- a/deepspeech/training/scheduler.py
+++ b/deepspeech/training/scheduler.py
@@ -11,18 +11,53 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Any
+from typing import Dict
+from typing import Text
 from typing import Union
 
 from paddle.optimizer.lr import LRScheduler
 from typeguard import check_argument_types
 
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.dynamic_import import instance_class
 from deepspeech.utils.log import Log
 
-__all__ = ["WarmupLR"]
+__all__ = ["WarmupLR", "LRSchedulerFactory"]
 
 logger = Log(__name__).getlog()
 
+SCHEDULER_DICT = {
+    "noam": "paddle.optimizer.lr:NoamDecay",
+    "expdecaylr": "paddle.optimizer.lr:ExponentialDecay",
+    "piecewisedecay": "paddle.optimizer.lr:PiecewiseDecay",
+}
+
+def register_scheduler(cls):
+    """Register scheduler."""
+    alias = cls.__name__.lower()
+    SCHEDULER_DICT[alias] = cls.__module__ + ":" + cls.__name__
+    return cls
+
+
+def dynamic_import_scheduler(module):
+    """Import Scheduler class dynamically.
+
+    Args:
+        module (str): module_name:class_name or alias in `SCHEDULER_DICT`
+
+    Returns:
+        type: Scheduler class
+
+    """
+    module_class = dynamic_import(module, SCHEDULER_DICT)
+    assert issubclass(module_class,
+                      LRScheduler), f"{module} does not implement LRScheduler"
+    return module_class
+
+
+@register_scheduler
 class WarmupLR(LRScheduler):
     """The WarmupLR scheduler
     This scheduler is almost same as NoamLR Scheduler except for following
@@ -40,7 +75,8 @@ class WarmupLR(LRScheduler):
                  warmup_steps: Union[int, float]=25000,
                  learning_rate=1.0,
                  last_epoch=-1,
-                 verbose=False):
+                 verbose=False,
+                 **kwargs):
         assert check_argument_types()
         self.warmup_steps = warmup_steps
         super().__init__(learning_rate, last_epoch, verbose)
@@ -64,3 +100,10 @@
             None
         '''
         self.step(epoch=step)
+
+
+class LRSchedulerFactory():
+    @classmethod
+    def from_args(cls, name: str, args: Dict[Text, Any]):
+        module_class = dynamic_import_scheduler(name.lower())
+        return instance_class(module_class, args)
diff --git a/deepspeech/utils/dynamic_import.py b/deepspeech/utils/dynamic_import.py
new file mode 100644
index 00000000..81586e3e
--- /dev/null
+++ b/deepspeech/utils/dynamic_import.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+from typing import Any
+from typing import Dict
+from typing import Text
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ["dynamic_import", "instance_class"]
+
+
+def dynamic_import(import_path, alias=dict()):
+    """dynamic import module and class
+
+    :param str import_path: syntax 'module_name:class_name'
+        e.g., 'deepspeech.models.u2:U2Model'
+    :param dict alias: shortcut for registered class
+    :return: imported class
+    """
+    if import_path not in alias and ":" not in import_path:
+        raise ValueError("import_path should be one of {} or "
+                         'include ":", e.g. "deepspeech.models.u2:U2Model" : '
+                         "{}".format(set(alias), import_path))
+    if ":" not in import_path:
+        import_path = alias[import_path]
+
+    module_name, objname = import_path.split(":")
+    m = importlib.import_module(module_name)
+    return getattr(m, objname)
+
+
+def instance_class(module_class, args: Dict[Text, Any]):
+    # filter out args whose value is None
+    new_args = {key: val for key, val in args.items() if val is not None}
+    logger.info(f"Instance: {module_class.__name__} {new_args}.")
+    return module_class(**new_args)
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index 8a769dca..26188677 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -16,7 +16,7 @@ collator:
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
-  batch_size: 64
+  batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
   specgram_type: fbank #linear, mfcc, fbank
   feat_dim: 80
@@ -73,7 +73,7 @@ model:
 
 training:
   n_epoch: 120
-  accum_grad: 2
+  accum_grad: 4
   global_grad_clip: 5.0
   optim: adam
   optim_conf:
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
index 606300bd..1b701aa2 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -3,18 +3,20 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
+  min_input_len: 0.5 # second
+  max_input_len: 20.0 # second
+  min_output_len: 0.0 # tokens
+  max_output_len: 400.0 # tokens
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+
+collator:
+  mean_std_filepath: ""
   vocab_filepath: data/vocab.txt
   unit_type: 'spm'
   spm_model_prefix: 'data/bpe_unigram_200'
-  mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
   batch_size: 4
-  min_input_len: 0.5
-  max_input_len: 20.0
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
   specgram_type: fbank #linear, mfcc, fbank
   feat_dim: 80
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
index 72d36848..31dfd26c 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -3,18 +3,20 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  vocab_filepath: data/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
-  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
-  batch_size: 4
   min_input_len: 0.5 # second
   max_input_len: 20.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # 
tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + +collator: + mean_std_filepath: "" + vocab_filepath: data/vocab.txt + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_202' + augmentation_config: conf/augmentation.json + batch_size: 4 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index a6f73050..b40e77e3 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -3,18 +3,20 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny + min_input_len: 0.5 # second + max_input_len: 20.0 # second + min_output_len: 0.0 # tokens + max_output_len: 400.0 # tokens + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + mean_std_filepath: "" vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - mean_std_filepath: "" augmentation_config: conf/augmentation.json batch_size: 4 - min_input_len: 0.5 - max_input_len: 20.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 71cbdde7..e97ad756 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -11,30 +11,29 @@ data: max_output_input_ratio: 10.0 collator: - vocab_filepath: data/vocab.txt mean_std_filepath: "" - augmentation_config: conf/augmentation.json - random_seed: 0 + vocab_filepath: data/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' - specgram_type: fbank + spm_model_prefix: 'data/bpe_unigram_202' + augmentation_config: conf/augmentation.json + batch_size: 4 + raw_wav: True # use raw_wav or kaldi feature + specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None + dither: 1.0 target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 use_dB_normalization: True target_dB: -20 - dither: 1.0 + random_seed: 0 keep_transcription_text: False - batch_size: 4 sortagrad: True shuffle_method: batch_shuffle - num_workers: 0 #2 - raw_wav: True # use raw_wav or kaldi feature - + num_workers: 2 # network architecture model:
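
Usage sketch (illustrative only, not part of the diff): how the factories
added above fit together. The numeric values below are made up; the factory
classes, the "warmuplr"/"adam" aliases, and the None-filtering performed by
instance_class() are the ones introduced in this patch.

    import paddle

    from deepspeech.training.optimizer import OptimizerFactory
    from deepspeech.training.scheduler import LRSchedulerFactory

    model = paddle.nn.Linear(80, 80)  # stand-in for the real U2 model

    # Mirrors U2Trainer.setup_model(): keys whose value is None (here
    # "gamma", which only expdecaylr uses) are dropped by instance_class()
    # before the scheduler class is instantiated. "warmuplr" resolves via
    # the @register_scheduler alias of WarmupLR.
    scheduler_args = {
        "learning_rate": 0.002,
        "verbose": False,
        "warmup_steps": 25000,
        "gamma": None,
    }
    lr_scheduler = LRSchedulerFactory.from_args("warmuplr", scheduler_args)

    # Inside OptimizerFactory.from_args(), "grad_clip" is wrapped in
    # ClipGradByGlobalNormWithLog and "weight_decay" in
    # paddle.regularizer.L2Decay before paddle.optimizer.Adam is built.
    optim_args = {
        "grad_clip": 5.0,
        "weight_decay": 1e-6,
        "learning_rate": lr_scheduler,
        "parameters": model.parameters(),
    }
    optimizer = OptimizerFactory.from_args("adam", optim_args)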