Merge pull request #756 from PaddlePaddle/filter

test w/ all example & fix ctc api & add new io
Hui Zhang 3 years ago committed by GitHub
commit f05f367cc5


@@ -3431,7 +3431,7 @@
 "        convolution_layer_args = (output_size, cnn_module_kernel, activation,\n",
 "                                  cnn_module_norm, causal)\n",
 "\n",
-"        self.encoders = nn.ModuleList([\n",
+"        self.encoders = nn.LayerList([\n",
 "            ConformerEncoderLayer(\n",
 "                size=output_size,\n",
 "                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n",

@@ -30,24 +30,13 @@ logger = Log(__name__).getlog()
 logger.warn = logger.warning
 ########### hack paddle #############
-paddle.bool = 'bool'
-paddle.float16 = 'float16'
 paddle.half = 'float16'
-paddle.float32 = 'float32'
 paddle.float = 'float32'
-paddle.float64 = 'float64'
 paddle.double = 'float64'
-paddle.int8 = 'int8'
-paddle.int16 = 'int16'
 paddle.short = 'int16'
-paddle.int32 = 'int32'
 paddle.int = 'int32'
-paddle.int64 = 'int64'
 paddle.long = 'int64'
-paddle.uint8 = 'uint8'
 paddle.uint16 = 'uint16'
-paddle.complex64 = 'complex64'
-paddle.complex128 = 'complex128'
 paddle.cdouble = 'complex128'
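Only the torch-style alias names survive; each is just a paddle dtype string, which paddle factory APIs accept directly. A quick check, assuming the aliases above have been registered:

import paddle

ids = paddle.zeros([2, 3], dtype=paddle.long)   # paddle.long == 'int64' after the hack
print(ids.dtype)                                # paddle.int64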
@@ -403,45 +392,7 @@ if not hasattr(paddle.nn.functional, 'glu'):
 #     return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
-# hack loss
-def ctc_loss(logits,
-             labels,
-             input_lengths,
-             label_lengths,
-             blank=0,
-             reduction='mean',
-             norm_by_times=True):
-    #logger.info("my ctc loss with norm by times")
-    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
-    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
-                                           input_lengths, label_lengths)
-    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
-    assert reduction in ['mean', 'sum', 'none']
-    if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / label_lengths)
-    elif reduction == 'sum':
-        loss_out = paddle.sum(loss_out)
-    return loss_out
-
-logger.warn(
-    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
-)
-F.ctc_loss = ctc_loss
 ########### hack paddle.nn #############
-if not hasattr(paddle.nn, 'Module'):
-    logger.warn("register user Module to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'Module', paddle.nn.Layer)
-
-# maybe cause assert isinstance(sublayer, core.Layer)
-if not hasattr(paddle.nn, 'ModuleList'):
-    logger.warn(
-        "register user ModuleList to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
 class GLU(nn.Layer):
     """Gated Linear Units (GLU) Layer"""

@@ -264,12 +264,12 @@ class U2Trainer(Trainer):
         config.data.manifest = config.data.test_manifest
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size, save training time, so filter test egs now.
-        # config.data.min_input_len = 0.0  # second
-        # config.data.max_input_len = float('inf')  # second
-        # config.data.min_output_len = 0.0  # tokens
-        # config.data.max_output_len = float('inf')  # tokens
-        # config.data.min_output_input_ratio = 0.00
-        # config.data.max_output_input_ratio = float('inf')
+        config.data.min_input_len = 0.0  # second
+        config.data.max_input_len = float('inf')  # second
+        config.data.min_output_len = 0.0  # tokens
+        config.data.max_output_len = float('inf')  # tokens
+        config.data.min_output_input_ratio = 0.00
+        config.data.max_output_input_ratio = float('inf')
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id

@@ -13,18 +13,28 @@
 # limitations under the License.
 """Contains the data augmentation pipeline."""
 import json
+from collections.abc import Sequence
+from inspect import signature
 import numpy as np
-from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor
-from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor
-from deepspeech.frontend.augmentor.online_bayesian_normalization import \
-    OnlineBayesianNormalizationAugmentor
-from deepspeech.frontend.augmentor.resample import ResampleAugmentor
-from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor
-from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor
-from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor
-from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor
+from deepspeech.frontend.augmentor.base import AugmentorBase
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.log import Log
+
+__all__ = ["AugmentationPipeline"]
+
+logger = Log(__name__).getlog()
+
+import_alias = dict(
+    volume="deepspeech.frontend.augmentor.volume_perturb:VolumePerturbAugmentor",
+    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
+    speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
+    resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
+    bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
+    noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
+    impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
+    specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
 class AugmentationPipeline():
@@ -78,20 +88,74 @@ class AugmentationPipeline():
     augmentor to take effect. If "prob" is zero, the augmentor does not take
     effect.
-    :param augmentation_config: Augmentation configuration in json string.
-    :type augmentation_config: str
-    :param random_seed: Random seed.
-    :type random_seed: int
-    :raises ValueError: If the augmentation json config is in incorrect format.
+    Params:
+        augmentation_config(str): Augmentation configuration in json string.
+        random_seed(int): Random seed.
+        train(bool): whether in train mode.
+
+    Raises:
+        ValueError: If the augmentation json config is in incorrect format.
     """

-    def __init__(self, augmentation_config: str, random_seed=0):
+    def __init__(self, augmentation_config: str, random_seed: int=0):
         self._rng = np.random.RandomState(random_seed)
         self._spec_types = ('specaug')
-        self._augmentors, self._rates = self._parse_pipeline_from(
-            augmentation_config, 'audio')
+
+        if augmentation_config is None:
+            self.conf = {}
+        else:
+            self.conf = json.loads(augmentation_config)
+
+        self._augmentors, self._rates = self._parse_pipeline_from('all')
+        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
+            'audio')
         self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
-            augmentation_config, 'feature')
+            'feature')
def __call__(self, xs, uttid_list=None, **kwargs):
if not isinstance(xs, Sequence):
is_batch = False
xs = [xs]
else:
is_batch = True
if isinstance(uttid_list, str):
uttid_list = [uttid_list for _ in range(len(xs))]
if self.conf.get("mode", "sequential") == "sequential":
for idx, (func, rate) in enumerate(
zip(self._augmentors, self._rates), 0):
if self._rng.uniform(0., 1.) >= rate:
continue
# Derive only the args which the func has
try:
param = signature(func).parameters
except ValueError:
                    # Some functions, e.g. built-in functions, fail here
param = {}
_kwargs = {k: v for k, v in kwargs.items() if k in param}
try:
if uttid_list is not None and "uttid" in param:
xs = [
func(x, u, **_kwargs)
for x, u in zip(xs, uttid_list)
]
else:
xs = [func(x, **_kwargs) for x in xs]
except Exception:
                    logger.fatal("Caught an exception from the {}th func: {}"
                                 .format(idx, func))
raise
else:
raise NotImplementedError(
"Not supporting mode={}".format(self.conf["mode"]))
if is_batch:
return xs
else:
return xs[0]
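With the config now parsed once in the constructor, the same pipeline object serves the collator (transform_audio/transform_feature) and the batch-style __call__ used by the new reader. A minimal construction sketch (augmentor params hypothetical; audio_segment stands in for a real AudioSegment):

import json

config = json.dumps([
    {"type": "shift",
     "params": {"min_shift_ms": -5, "max_shift_ms": 5},
     "prob": 1.0},
])
pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
pipeline.transform_audio(audio_segment)   # audio-level effects, in place
# feature-level augmentors (e.g. specaug) run via pipeline.transform_feature(feat)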
     def transform_audio(self, audio_segment):
         """Run the pre-processing pipeline for data augmentation.
@@ -101,7 +165,7 @@ class AugmentationPipeline():
         :param audio_segment: Audio segment to process.
         :type audio_segment: AudioSegment|SpeechSegment
         """
-        for augmentor, rate in zip(self._augmentors, self._rates):
+        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
             if self._rng.uniform(0., 1.) < rate:
                 augmentor.transform_audio(audio_segment)
@@ -116,14 +180,14 @@ class AugmentationPipeline():
             spec_segment = augmentor.transform_feature(spec_segment)
         return spec_segment
-    def _parse_pipeline_from(self, config_json, aug_type='audio'):
+    def _parse_pipeline_from(self, aug_type='all'):
         """Parse the config json to build an augmentation pipeline."""
-        assert aug_type in ('audio', 'feature'), aug_type
-        try:
-            configs = json.loads(config_json)
+        assert aug_type in ('audio', 'feature', 'all'), aug_type
         audio_confs = []
         feature_confs = []
-        for config in configs:
+        all_confs = []
+        for config in self.conf:
+            all_confs.append(config)
             if config["type"] in self._spec_types:
                 feature_confs.append(config)
             else:
@@ -133,35 +197,22 @@ class AugmentationPipeline():
             aug_confs = audio_confs
         elif aug_type == 'feature':
             aug_confs = feature_confs
+        else:
+            aug_confs = all_confs
         augmentors = [
             self._get_augmentor(config["type"], config["params"])
             for config in aug_confs
         ]
         rates = [config["prob"] for config in aug_confs]
-        except Exception as e:
-            raise ValueError("Failed to parse the augmentation config json: "
-                             "%s" % str(e))
         return augmentors, rates
     def _get_augmentor(self, augmentor_type, params):
         """Return an augmentation model by the type name, and pass in params."""
-        if augmentor_type == "volume":
-            return VolumePerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "shift":
-            return ShiftPerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "speed":
-            return SpeedPerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "resample":
-            return ResampleAugmentor(self._rng, **params)
-        elif augmentor_type == "bayesian_normal":
-            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
-        elif augmentor_type == "noise":
-            return NoisePerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "impulse":
-            return ImpulseResponseAugmentor(self._rng, **params)
-        elif augmentor_type == "specaug":
-            return SpecAugmentor(self._rng, **params)
-        else:
-            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
+        class_obj = dynamic_import(augmentor_type, import_alias)
+        assert issubclass(class_obj, AugmentorBase)
+        try:
+            obj = class_obj(self._rng, **params)
+        except Exception:
+            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
+        return obj

@@ -28,6 +28,10 @@ class AugmentorBase():
     def __init__(self):
         pass

+    @abstractmethod
+    def __call__(self, xs):
+        raise NotImplementedError
+
     @abstractmethod
     def transform_audio(self, audio_segment):
         """Adds various effects to the input audio segment. Such effects

@@ -30,6 +30,11 @@ class ImpulseResponseAugmentor(AugmentorBase):
         self._rng = rng
         self._impulse_manifest = read_manifest(impulse_manifest_path)

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Add impulse response effect.

@@ -36,6 +36,11 @@ class NoisePerturbAugmentor(AugmentorBase):
         self._rng = rng
         self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Add background noise audio.

@@ -44,6 +44,11 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase):
         self._rng = rng
         self._startup_delay = startup_delay

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Normalizes the input audio using the online Bayesian approach.

@@ -31,6 +31,11 @@ class ResampleAugmentor(AugmentorBase):
         self._new_sample_rate = new_sample_rate
         self._rng = rng

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Resamples the input audio to a target sample rate.

@@ -31,6 +31,11 @@ class ShiftPerturbAugmentor(AugmentorBase):
         self._max_shift_ms = max_shift_ms
         self._rng = rng

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Shift audio.

@@ -157,6 +157,11 @@ class SpecAugmentor(AugmentorBase):
         self._time_mask = (t_0, t_0 + t)
         return xs

+    def __call__(self, x, train=True):
+        if not train:
+            return x
+        return self.transform_feature(x)
+
     def transform_feature(self, xs: np.ndarray):
         """
         Args:

@@ -79,6 +79,11 @@ class SpeedPerturbAugmentor(AugmentorBase):
         self._rates = np.linspace(
             self._min_rate, self._max_rate, self._num_rates, endpoint=True)

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Sample a new speed rate from the given range and
         change the speed of the given audio clip.

@@ -37,6 +37,11 @@ class VolumePerturbAugmentor(AugmentorBase):
         self._max_gain_dBFS = max_gain_dBFS
         self._rng = rng

+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Change audio loudness.

@@ -11,139 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
import numpy as np
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
def create_dataloader(manifest_path,
unit_type,
vocab_filepath,
mean_std_filepath,
spm_model_prefix,
augmentation_config='{}',
max_input_len=float('inf'),
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
feat_dim=None,
delta_delta=False,
use_dB_normalization=True,
random_seed=0,
keep_transcription_text=False,
is_training=False,
batch_size=1,
num_workers=0,
sortagrad=False,
shuffle_method=None,
dist=False):
dataset = ManifestDataset(
manifest_path=manifest_path,
unit_type=unit_type,
vocab_filepath=vocab_filepath,
mean_std_filepath=mean_std_filepath,
spm_model_prefix=spm_model_prefix,
augmentation_config=augmentation_config,
max_input_len=max_input_len,
min_input_len=min_input_len,
max_output_len=max_output_len,
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
specgram_type=specgram_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
use_dB_normalization=use_dB_normalization,
random_seed=random_seed,
keep_transcription_text=keep_transcription_text)
if dist:
batch_sampler = SortagradDistributedBatchSampler(
dataset,
batch_size,
num_replicas=None,
rank=None,
shuffle=is_training,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
dataset,
shuffle=is_training,
batch_size=batch_size,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
def padding_batch(batch,
padding_to=-1,
flatten=False,
keep_transcription_text=True):
"""
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one bach.
If ``padding_to`` is -1, the maximun shape in the batch will be used
as the target shape for padding. Otherwise, `padding_to` will be the
target shape (only refers to the second axis).
If `flatten` is True, features will be flatten to 1darray.
"""
new_batch = []
# get target shape
max_length = max([audio.shape[1] for audio, text in batch])
if padding_to != -1:
if padding_to < max_length:
raise ValueError("If padding_to is not -1, it should be larger "
"than any instance's shape in the batch")
max_length = padding_to
max_text_length = max([len(text) for audio, text in batch])
# padding
padded_audios = []
audio_lens = []
texts, text_lens = [], []
for audio, text in batch:
padded_audio = np.zeros([audio.shape[0], max_length])
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
padded_audios.append(padded_audio)
audio_lens.append(audio.shape[1])
padded_text = np.zeros([max_text_length])
if keep_transcription_text:
padded_text[:len(text)] = [ord(t) for t in text] # string
else:
padded_text[:len(text)] = text # ids
texts.append(padded_text)
text_lens.append(len(text))
padded_audios = np.array(padded_audios).astype('float32')
audio_lens = np.array(audio_lens).astype('int64')
texts = np.array(texts).astype('int32')
text_lens = np.array(text_lens).astype('int64')
return padded_audios, audio_lens, texts, text_lens
# collate_fn=functools.partial(padding_batch, keep_transcription_text=keep_transcription_text),
collate_fn = SpeechCollator(keep_transcription_text=keep_transcription_text)
loader = DataLoader(
dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn,
num_workers=num_workers)
return loader

@@ -0,0 +1,469 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
from deepspeech.utils.log import Log
__all__ = ["make_batchset"]
logger = Log(__name__).getlog()
def batchfy_by_seq(
sorted_data,
batch_size,
max_length_in,
max_length_out,
min_batch_size=1,
shortest_first=False,
ikey="input",
iaxis=0,
okey="output",
oaxis=0, ):
"""Make batch set from json dictionary
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_size: batch size
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
:param int min_batch_size: mininum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input
(for ASR ikey="input", for TTS, MT ikey="output".)
:param int iaxis: dimension to access input
(for ASR, TTS iaxis=0, for MT iaxis="1".)
:param str okey: key to access output
(for ASR, MT okey="output". for TTS okey="input".)
:param int oaxis: dimension to access output
(for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
if batch_size <= 0:
raise ValueError(f"Invalid batch_size={batch_size}")
# check #utts is more than min_batch_size
if len(sorted_data) < min_batch_size:
raise ValueError(
f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
)
# make list of minibatches
minibatches = []
start = 0
while True:
_, info = sorted_data[start]
ilen = int(info[ikey][iaxis]["shape"][0])
olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else
max(map(lambda x: int(x["shape"][0]), info[okey])))
factor = max(int(ilen / max_length_in), int(olen / max_length_out))
# change batchsize depending on the input and output length
# if ilen = 1000 and max_length_in = 800
# then b = batchsize / 2
# and max(min_batches, .) avoids batchsize = 0
bs = max(min_batch_size, int(batch_size / (1 + factor)))
end = min(len(sorted_data), start + bs)
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
# check each batch is more than minimum batchsize
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# batch: List[List[Tuple[str, dict]]]
return minibatches
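The adaptive rule above shrinks a batch whenever its leading (longest) utterance exceeds the reference lengths. A runnable toy illustration (feature dims and lengths hypothetical):

toy = [(f"utt{i}", {"input": [{"shape": [ilen, 83]}],
                    "output": [{"shape": [olen, 1]}]})
       for i, (ilen, olen) in enumerate(
           [(1000, 120), (900, 100), (400, 40), (300, 30)])]
batches = batchfy_by_seq(toy, batch_size=4,
                         max_length_in=800, max_length_out=150)
# first batch: factor = max(int(1000/800), int(120/150)) = 1
#              -> bs = max(1, int(4 / 2)) = 2, so [utt0, utt1], then [utt2, utt3]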
def batchfy_by_bin(
sorted_data,
batch_bins,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variably sized batch set, which maximizes
the number of bins up to `batch_bins`.
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_bins: Maximum frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if batch_bins <= 0:
raise ValueError(f"invalid batch_bins={batch_bins}")
length = len(sorted_data)
idim = int(sorted_data[0][1][ikey][0]["shape"][1])
odim = int(sorted_data[0][1][okey][0]["shape"][1])
logger.info("# utts: " + str(len(sorted_data)))
minibatches = []
start = 0
n = 0
while True:
# Dynamic batch size depending on size of samples
b = 0
next_size = 0
max_olen = 0
while next_size < batch_bins and (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
if olen > max_olen:
max_olen = olen
next_size = (max_olen + ilen) * (b + 1)
if next_size <= batch_bins:
b += 1
elif next_size == 0:
raise ValueError(
f"Can't fit one sample in batch_bins ({batch_bins}): "
f"Please increase the value")
end = min(length, start + max(min_batch_size, b))
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
# Check for min_batch_size and fixes the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
if end == length:
break
start = end
n += 1
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples " + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
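The bin criterion charges each batch (max_olen + ilen) * (b + 1) "bins" (frames times feature dim), so one long utterance caps how many neighbours can join it. A quick arithmetic check with hypothetical sizes:

# idim = 83, batch_bins = 800_000
# every utt: ilen = 1000 frames -> ilen * idim = 83_000 bins; max_olen = 120
# (120 + 83_000) * 9  = 748_080 <= 800_000  -> 9 utts accepted
# (120 + 83_000) * 10 = 831_200 >  800_000  -> stop; this batch holds 9 utts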
def batchfy_by_frame(
sorted_data,
max_frames_in,
max_frames_out,
max_frames_inout,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variable batch set, which maximizes the number of frames to max_batch_frame.
    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int max_frames_in: Maximum input frames of a batch
:param int max_frames_out: Maximum output frames of a batch
:param int max_frames_inout: Maximum input+output frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
raise ValueError(
"At least, one of `--batch-frames-in`, `--batch-frames-out` or "
"`--batch-frames-inout` should be > 0")
length = len(sorted_data)
minibatches = []
start = 0
end = 0
while end != length:
# Dynamic batch size depending on size of samples
b = 0
max_olen = 0
max_ilen = 0
while (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
if ilen > max_frames_in and max_frames_in != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
f"Please increase the value")
olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
if olen > max_frames_out and max_frames_out != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
f"Please increase the value")
if ilen + olen > max_frames_inout and max_frames_inout != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-inout ({max_frames_inout}): "
                    f"Please increase the value")
max_olen = max(max_olen, olen)
max_ilen = max(max_ilen, ilen)
in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
inout_ok = (max_ilen + max_olen) * (
b + 1) <= max_frames_inout or max_frames_inout == 0
if in_ok and out_ok and inout_ok:
# add more seq in the minibatch
b += 1
else:
# no more seq in the minibatch
break
end = min(length, start + b)
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
# Check for min_batch_size and fixes the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
start = end
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples" + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,
shortest_first):
import random
logger.info("use shuffled batch.")
sorted_data = random.sample(data.items(), len(data.items()))
logger.info("# utts: " + str(len(sorted_data)))
# make list of minibatches
minibatches = []
start = 0
while True:
end = min(len(sorted_data), start + batch_size)
# check each batch is more than minimum batchsize
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# for debugging
if num_batches > 0:
minibatches = minibatches[:num_batches]
logger.info("# minibatches: " + str(len(minibatches)))
return minibatches
BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]
def make_batchset(
data,
batch_size=0,
max_length_in=float("inf"),
max_length_out=float("inf"),
num_batches=0,
min_batch_size=1,
shortest_first=False,
batch_sort_key="input",
count="auto",
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
iaxis=0,
oaxis=0, ):
"""Make batch set from json dictionary
if utts have "category" value,
>>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'},
... {'category': 'B', 'input': ..., 'utt':'utt2'},
... {'category': 'B', 'input': ..., 'utt':'utt3'},
... {'category': 'A', 'input': ..., 'utt':'utt4'}]
    >>> make_batchset(data, batch_size=2, ...)
    [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]
    Note that if any utt doesn't have "category",
    this performs the same as batchfy_by_{count}
:param List[Dict[str, Any]] data: dictionary loaded from data.json
:param int batch_size: maximum number of sequences in a minibatch.
:param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
:param int batch_frames_in: maximum number of input frames in a minibatch.
:param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
:param str count: strategy to count maximum size of batch.
For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str batch_sort_key: how to sort data before creating minibatches
["input", "output", "shuffle"]
:param int iaxis: dimension to access input
(for ASR, TTS iaxis=0, for MT iaxis="1".)
:param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
# check args
if count not in BATCH_COUNT_CHOICES:
raise ValueError(
f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
f"one of {BATCH_SORT_KEY_CHOICES}")
ikey = "input"
okey = "output"
batch_sort_axis = 0 # index of list
if count == "auto":
if batch_size != 0:
count = "seq"
elif batch_bins != 0:
count = "bin"
elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
count = "frame"
else:
raise ValueError(
f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
)
logger.info(f"count is auto detected as {count}")
if count != "seq" and batch_sort_key == "shuffle":
raise ValueError(
"batch_sort_key=shuffle is only available if batch_count=seq")
category2data = {} # Dict[str, dict]
for v in data:
k = v['utt']
category2data.setdefault(v.get("category"), {})[k] = v
batches_list = [] # List[List[List[Tuple[str, dict]]]]
for d in category2data.values():
if batch_sort_key == "shuffle":
batches = batchfy_shuffle(d, batch_size, min_batch_size,
num_batches, shortest_first)
batches_list.append(batches)
continue
# sort it by input lengths (long to short)
sorted_data = sorted(
d.items(),
key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
reverse=not shortest_first, )
logger.info("# utts: " + str(len(sorted_data)))
if count == "seq":
batches = batchfy_by_seq(
sorted_data,
batch_size=batch_size,
max_length_in=max_length_in,
max_length_out=max_length_out,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
iaxis=iaxis,
okey=okey,
oaxis=oaxis, )
if count == "bin":
batches = batchfy_by_bin(
sorted_data,
batch_bins=batch_bins,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
if count == "frame":
batches = batchfy_by_frame(
sorted_data,
max_frames_in=batch_frames_in,
max_frames_out=batch_frames_out,
max_frames_inout=batch_frames_inout,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
batches_list.append(batches)
if len(batches_list) == 1:
batches = batches_list[0]
else:
# Concat list. This way is faster than "sum(batch_list, [])"
batches = list(itertools.chain(*batches_list))
# for debugging
if num_batches > 0:
batches = batches[:num_batches]
logger.info("# minibatches: " + str(len(batches)))
# batch: List[List[Tuple[str, dict]]]
return batches

@@ -23,7 +23,7 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import IGNORE_ID
-from deepspeech.io.utility import pad_sequence
+from deepspeech.io.utility import pad_list
 from deepspeech.utils.log import Log

 __all__ = ["SpeechCollator"]
@@ -286,13 +286,12 @@ class SpeechCollator():
             texts.append(tokens)
             text_lens.append(tokens.shape[0])

-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_texts = pad_sequence(
-            texts, padding_value=IGNORE_ID).astype(np.int64)
-        text_lens = np.array(text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, padded_texts, text_lens
+        #[B, T, D]
+        xs_pad = pad_list(audios, 0.0).astype(np.float32)
+        ilens = np.array(audio_lens).astype(np.int64)
+        ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
+        olens = np.array(text_lens).astype(np.int64)
+        return utts, xs_pad, ilens, ys_pad, olens

     @property
     def manifest(self):

@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log
__all__ = ["CustomConverter"]
logger = Log(__name__).getlog()
class CustomConverter():
"""Custom batch converter.
Args:
subsampling_factor (int): The subsampling factor.
dtype (np.dtype): Data type to convert.
"""
def __init__(self, subsampling_factor=1, dtype=np.float32):
"""Construct a CustomConverter object."""
self.subsampling_factor = subsampling_factor
self.ignore_id = -1
self.dtype = dtype
def __call__(self, batch):
"""Transform a batch and send it to a device.
Args:
batch (list): The batch to transform.
Returns:
tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)
"""
# batch should be located in list
assert len(batch) == 1
(xs, ys), utts = batch[0]
# perform subsampling
if self.subsampling_factor > 1:
xs = [x[::self.subsampling_factor, :] for x in xs]
# get batch of lengths of input sequences
ilens = np.array([x.shape[0] for x in xs])
# perform padding and convert to tensor
# currently only support real number
if xs[0].dtype.kind == "c":
xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
# Note(kamo):
# {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
# Don't create ComplexTensor and give it E2E here
            # because torch.nn.DataParallel can't handle it.
xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
else:
xs_pad = pad_list(xs, 0).astype(self.dtype)
# NOTE: this is for multi-output (e.g., speech translation)
ys_pad = pad_list(
[np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
self.ignore_id)
olens = np.array(
[y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
return utts, xs_pad, ilens, ys_pad, olens
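A minimal sketch of the converter contract (toy arrays): the whole minibatch arrives as a single-element list whose item is already paired as ((xs, ys), utts):

import numpy as np

conv = CustomConverter(subsampling_factor=1, dtype=np.float32)
xs = [np.random.randn(120, 83), np.random.randn(90, 83)]   # (T, D) features
ys = [np.array([3, 7, 2]), np.array([5, 1])]               # token id targets
utts, xs_pad, ilens, ys_pad, olens = conv([((xs, ys), ["utt1", "utt2"])])
# xs_pad: (2, 120, 83) float32; ys_pad is right-padded with ignore_id (-1)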

@@ -0,0 +1,138 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.io import DataLoader
from deepspeech.frontend.utility import read_manifest
from deepspeech.io.batchfy import make_batchset
from deepspeech.io.converter import CustomConverter
from deepspeech.io.dataset import TransformDataset
from deepspeech.io.reader import LoadInputsAndTargets
from deepspeech.utils.log import Log
__all__ = ["BatchDataLoader"]
logger = Log(__name__).getlog()
class BatchDataLoader():
def __init__(self,
json_file: str,
train_mode: bool,
sortagrad: bool=False,
batch_size: int=0,
maxlen_in: float=float('inf'),
maxlen_out: float=float('inf'),
minibatches: int=0,
mini_batch_size: int=1,
batch_count: str='auto',
batch_bins: int=0,
batch_frames_in: int=0,
batch_frames_out: int=0,
batch_frames_inout: int=0,
preprocess_conf=None,
n_iter_processes: int=1,
subsampling_factor: int=1,
num_encs: int=1):
self.json_file = json_file
self.train_mode = train_mode
self.use_sortagrad = sortagrad == -1 or sortagrad > 0
self.batch_size = batch_size
self.maxlen_in = maxlen_in
self.maxlen_out = maxlen_out
self.batch_count = batch_count
self.batch_bins = batch_bins
self.batch_frames_in = batch_frames_in
self.batch_frames_out = batch_frames_out
self.batch_frames_inout = batch_frames_inout
self.subsampling_factor = subsampling_factor
self.num_encs = num_encs
self.preprocess_conf = preprocess_conf
self.n_iter_processes = n_iter_processes
# read json data
self.data_json = read_manifest(json_file)
# make minibatch list (variable length)
        self.minibatches = make_batchset(
self.data_json,
batch_size,
maxlen_in,
maxlen_out,
minibatches, # for debug
min_batch_size=mini_batch_size,
shortest_first=self.use_sortagrad,
count=batch_count,
batch_bins=batch_bins,
batch_frames_in=batch_frames_in,
batch_frames_out=batch_frames_out,
batch_frames_inout=batch_frames_inout,
iaxis=0,
oaxis=0, )
# data reader
self.reader = LoadInputsAndTargets(
mode="asr",
load_output=True,
preprocess_conf=preprocess_conf,
preprocess_args={"train":
train_mode}, # Switch the mode of preprocessing
)
# Setup a converter
if num_encs == 1:
self.converter = CustomConverter(
subsampling_factor=subsampling_factor, dtype=np.float32)
else:
            raise NotImplementedError("not impl CustomConverterMulEnc.")
# hack to make batchsize argument as 1
        # actual batchsize is included in a list
        # default collate function converts numpy array to paddle tensor
# we used an empty collate function instead which returns list
        self.dataset = TransformDataset(
            self.minibatches,
            lambda data: self.converter([self.reader(data, return_uttid=True)]))
self.dataloader = DataLoader(
dataset=self.dataset,
batch_size=1,
            shuffle=not self.use_sortagrad if train_mode else False,
collate_fn=lambda x: x[0],
num_workers=n_iter_processes, )
def __repr__(self):
echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
echo += f"train_mode: {self.train_mode}, "
echo += f"sortagrad: {self.use_sortagrad}, "
echo += f"batch_size: {self.batch_size}, "
echo += f"maxlen_in: {self.maxlen_in}, "
echo += f"maxlen_out: {self.maxlen_out}, "
echo += f"batch_count: {self.batch_count}, "
echo += f"batch_bins: {self.batch_bins}, "
echo += f"batch_frames_in: {self.batch_frames_in}, "
echo += f"batch_frames_out: {self.batch_frames_out}, "
echo += f"batch_frames_inout: {self.batch_frames_inout}, "
echo += f"subsampling_factor: {self.subsampling_factor}, "
echo += f"num_encs: {self.num_encs}, "
echo += f"num_workers: {self.n_iter_processes}, "
echo += f"file: {self.json_file}"
return echo
def __len__(self):
return len(self.dataloader)
def __iter__(self):
return self.dataloader.__iter__()
def __call__(self):
return self.__iter__()
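A usage sketch (paths and sizes hypothetical): build a loader over a data.json manifest and iterate the padded 5-tuples produced by CustomConverter:

loader = BatchDataLoader(
    json_file="data/manifest.train.json",   # hypothetical path
    train_mode=True,
    sortagrad=True,
    batch_size=32,
    maxlen_in=800,
    maxlen_out=150,
    batch_count="seq")
for utts, xs_pad, ilens, ys_pad, olens in loader:
    pass   # feed xs_pad/ilens/ys_pad/olens to the model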

@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset"]
+__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -76,12 +76,18 @@ class ManifestDataset(Dataset):
         Args:
             manifest_path (str): manifest json file path
-            max_input_len ([type], optional): maximum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
-            min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
-            max_output_len (float, optional): maximum output seq length, in modeling units. Defaults to 500.0.
-            min_output_len (float, optional): minimum output seq length, in modeling units. Defaults to 0.0.
-            max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio. Defaults to 10.0.
-            min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio. Defaults to 0.05.
+            max_input_len ([type], optional): maximum input seq length,
+                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
+            min_input_len (float, optional): minimum input seq length,
+                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
+            max_output_len (float, optional): maximum output seq length,
+                in modeling units. Defaults to 500.0.
+            min_output_len (float, optional): minimum output seq length,
+                in modeling units. Defaults to 0.0.
+            max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio.
+                Defaults to 10.0.
+            min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio.
+                Defaults to 0.05.
         """
         super().__init__()
@@ -116,3 +122,27 @@ class TripletManifestDataset(ManifestDataset):
         instance = self._manifest[idx]
         return instance["utt"], instance["feat"], instance["text"], instance[
             "text1"]
class TransformDataset(Dataset):
"""Transform Dataset.
Args:
data: list object from make_batchset
        transform: transform function
"""
def __init__(self, data, transform):
"""Init function."""
super().__init__()
self.data = data
self.transform = transform
def __len__(self):
"""Len function."""
return len(self.data)
def __getitem__(self, idx):
"""[] operator."""
return self.transform(self.data[idx])
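BatchDataLoader above pairs this with a collate function that simply unwraps the single-element batch; each stored minibatch is converted lazily at indexing time. A small sketch (names hypothetical):

# minibatches: output of make_batchset; convert: any callable over one batch
dataset = TransformDataset(minibatches, convert)
first = dataset[0]   # runs convert(minibatches[0]) on access, not at build time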

@@ -0,0 +1,409 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
from collections import OrderedDict

import h5py
import kaldiio
import numpy as np
import soundfile
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.utils.log import Log
__all__ = ["LoadInputsAndTargets"]
logger = Log(__name__).getlog()
class LoadInputsAndTargets():
"""Create a mini-batch from a list of dicts
>>> batch = [('utt1',
... dict(input=[dict(feat='some.ark:123',
... filetype='mat',
... name='input1',
... shape=[100, 80])],
... output=[dict(tokenid='1 2 3 4',
... name='target1',
    ...                             shape=[4, 31])]))]
>>> l = LoadInputsAndTargets()
>>> feat, target = l(batch)
:param: str mode: Specify the task mode, "asr" or "tts"
:param: str preprocess_conf: The path of a json file for pre-processing
:param: bool load_input: If False, not to load the input data
:param: bool load_output: If False, not to load the output data
:param: bool sort_in_input_length: Sort the mini-batch in descending order
of the input length
:param: bool use_speaker_embedding: Used for tts mode only
:param: bool use_second_target: Used for tts mode only
:param: dict preprocess_args: Set some optional arguments for preprocessing
"""
def __init__(
self,
mode="asr",
preprocess_conf=None,
load_input=True,
load_output=True,
sort_in_input_length=True,
preprocess_args=None,
keep_all_data_on_mem=False, ):
self._loaders = {}
if mode not in ["asr"]:
raise ValueError("Only asr are allowed: mode={}".format(mode))
if preprocess_conf is not None:
self.preprocessing = AugmentationPipeline(preprocess_conf)
            logger.warning(
"[Experimental feature] Some preprocessing will be done "
"for the mini-batch creation using {}".format(
self.preprocessing))
else:
            # If conf doesn't exist, this function doesn't touch anything.
self.preprocessing = None
self.mode = mode
self.load_output = load_output
self.load_input = load_input
self.sort_in_input_length = sort_in_input_length
if preprocess_args is None:
self.preprocess_args = {}
else:
assert isinstance(preprocess_args, dict), type(preprocess_args)
self.preprocess_args = dict(preprocess_args)
self.keep_all_data_on_mem = keep_all_data_on_mem
def __call__(self, batch, return_uttid=False):
"""Function to load inputs and targets from list of dicts
:param List[Tuple[str, dict]] batch: list of dict which is subset of
loaded data.json
:param bool return_uttid: return utterance ID information for visualization
:return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
:return: list of input feature sequences
[(T_1, D), (T_2, D), ..., (T_B, D)]
:rtype: list of float ndarray
:return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
:rtype: list of int ndarray
"""
x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]
y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]
uttid_list = [] # List[str]
for uttid, info in batch:
uttid_list.append(uttid)
if self.load_input:
# Note(kamo): This for-loop is for multiple inputs
for idx, inp in enumerate(info["input"]):
# {"input":
# [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# "name": "input1", ...}], ...}
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
x_feats_dict.setdefault(inp["name"], []).append(x)
if self.load_output:
for idx, inp in enumerate(info["output"]):
if "tokenid" in inp:
# ======= Legacy format for output =======
# {"output": [{"tokenid": "1 2 3 4"}])
x = np.fromiter(
map(int, inp["tokenid"].split()), dtype=np.int64)
else:
# ======= New format =======
# {"input":
# [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# "name": "target1", ...}], ...}
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
y_feats_dict.setdefault(inp["name"], []).append(x)
if self.mode == "asr":
return_batch, uttid_list = self._create_batch_asr(
x_feats_dict, y_feats_dict, uttid_list)
else:
raise NotImplementedError(self.mode)
if self.preprocessing is not None:
# Apply pre-processing all input features
for x_name in return_batch.keys():
if x_name.startswith("input"):
return_batch[x_name] = self.preprocessing(
return_batch[x_name], uttid_list,
**self.preprocess_args)
if return_uttid:
return tuple(return_batch.values()), uttid_list
# Doesn't return the names now.
return tuple(return_batch.values())
def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
"""Create a OrderedDict for the mini-batch
:param OrderedDict x_feats_dict:
e.g. {"input1": [ndarray, ndarray, ...],
"input2": [ndarray, ndarray, ...]}
:param OrderedDict y_feats_dict:
e.g. {"target1": [ndarray, ndarray, ...],
"target2": [ndarray, ndarray, ...]}
:param: List[str] uttid_list:
Give uttid_list to sort in the same order as the mini-batch
:return: batch, uttid_list
:rtype: Tuple[OrderedDict, List[str]]
"""
        # handle single-input and multi-input (parallel) asr mode
xs = list(x_feats_dict.values())
if self.load_output:
ys = list(y_feats_dict.values())
assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))
# get index of non-zero length samples
nonzero_idx = list(
filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
for n in range(1, len(y_feats_dict)):
nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)
else:
# Note(kamo): Be careful not to make nonzero_idx to a generator
nonzero_idx = list(range(len(xs[0])))
if self.sort_in_input_length:
# sort in input lengths based on the first input
nonzero_sorted_idx = sorted(
nonzero_idx, key=lambda i: -len(xs[0][i]))
else:
nonzero_sorted_idx = nonzero_idx
if len(nonzero_sorted_idx) != len(xs[0]):
            logger.warning(
"Target sequences include empty tokenid (batch {} -> {}).".
format(len(xs[0]), len(nonzero_sorted_idx)))
# remove zero-length samples
xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
x_names = list(x_feats_dict.keys())
if self.load_output:
ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
y_names = list(y_feats_dict.keys())
# Keeping x_name and y_name, e.g. input1, for future extension
return_batch = OrderedDict([
* [(x_name, x) for x_name, x in zip(x_names, xs)],
* [(y_name, y) for y_name, y in zip(y_names, ys)],
])
else:
return_batch = OrderedDict(
[(x_name, x) for x_name, x in zip(x_names, xs)])
return return_batch, uttid_list
def _get_from_loader(self, filepath, filetype):
"""Return ndarray
        In order to open the file descriptors only on the first reference,
        the loaders are cached in self._loaders.
>>> ndarray = loader.get_from_loader(
... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')
:param: str filepath:
:param: str filetype:
:return:
:rtype: np.ndarray
"""
if filetype == "hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "hdf5",
# -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = h5py.File(filepath, "r")
self._loaders[filepath] = loader
return loader[key][()]
elif filetype == "sound.hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
# "filetype": "sound.hdf5",
# -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = SoundHDF5File(filepath, "r", dtype="int16")
self._loaders[filepath] = loader
array, rate = loader[key]
return array
elif filetype == "sound":
# e.g.
# {"input": [{"feat": "some/path.wav",
# "filetype": "sound"},
# Assume PCM16
if not self.keep_all_data_on_mem:
array, _ = soundfile.read(filepath, dtype="int16")
return array
if filepath not in self._loaders:
array, _ = soundfile.read(filepath, dtype="int16")
self._loaders[filepath] = array
return self._loaders[filepath]
elif filetype == "npz":
# e.g.
# {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
# "filetype": "npz",
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = np.load(filepath)
self._loaders[filepath] = loader
return loader[key]
elif filetype == "npy":
# e.g.
# {"input": [{"feat": "some/path.npy",
# "filetype": "npy"},
if not self.keep_all_data_on_mem:
return np.load(filepath)
if filepath not in self._loaders:
self._loaders[filepath] = np.load(filepath)
return self._loaders[filepath]
elif filetype in ["mat", "vec"]:
# e.g.
# {"input": [{"feat": "some/path.ark:123",
# "filetype": "mat"}]},
# In this case, "123" indicates the starting points of the matrix
# load_mat can load both matrix and vector
if not self.keep_all_data_on_mem:
return kaldiio.load_mat(filepath)
if filepath not in self._loaders:
self._loaders[filepath] = kaldiio.load_mat(filepath)
return self._loaders[filepath]
elif filetype == "scp":
# e.g.
# {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
# "filetype": "scp",
filepath, key = filepath.split(":", 1)
loader = self._loaders.get(filepath)
if loader is None:
# To avoid disk access, create loader only for the first time
loader = kaldiio.load_scp(filepath)
self._loaders[filepath] = loader
return loader[key]
else:
raise NotImplementedError(
"Not supported: loader_type={}".format(filetype))
class SoundHDF5File():
    """Collect sound files into a single HDF5 file.

    >>> f = SoundHDF5File('a.flac.h5', mode='a')
    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
    >>> f['id'] = (array, 16000)
    >>> array, rate = f['id']

    :param str filepath: HDF5 file path
    :param str mode: file open mode, e.g. 'r', 'r+', 'a'
    :param str format: audio format used when saving: flac, nist, htk, etc.
    :param str dtype: sample dtype used when reading, e.g. 'int16'
    """

    def __init__(self,
                 filepath,
                 mode="r+",
                 format=None,
                 dtype="int16",
                 **kwargs):
        self.filepath = filepath
        self.mode = mode
        self.dtype = dtype

        self.file = h5py.File(filepath, mode, **kwargs)
        if format is None:
            # filepath = a.flac.h5 -> format = flac
            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
            format = second_ext[1:]
        if format.upper() not in soundfile.available_formats():
            # Fall back to flac if the inferred format is unknown
            format = "flac"
        # This format affects only saving
        self.format = format

    def __repr__(self):
        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
            self.filepath, self.mode, self.format, self.dtype)

    def create_dataset(self, name, shape=None, data=None, **kwds):
        f = io.BytesIO()
        array, rate = data
        soundfile.write(f, array, rate, format=self.format)
        self.file.create_dataset(
            name, shape=shape, data=np.void(f.getvalue()), **kwds)

    def __setitem__(self, name, data):
        self.create_dataset(name, data=data)

    def __getitem__(self, key):
        data = self.file[key][()]
        f = io.BytesIO(data.tobytes())
        array, rate = soundfile.read(f, dtype=self.dtype)
        return array, rate

    def keys(self):
        return self.file.keys()

    def values(self):
        for k in self.file:
            yield self[k]

    def items(self):
        for k in self.file:
            yield k, self[k]

    def __iter__(self):
        return iter(self.file)

    def __contains__(self, item):
        return item in self.file

    def __len__(self):
        return len(self.file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.file.close()

    def close(self):
        self.file.close()
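A short round-trip sketch for SoundHDF5File; the file name and data are illustrative, and since FLAC is lossless the int16 samples survive the round trip:

import numpy as np

with SoundHDF5File('utts.flac.h5', mode='a') as f:  # format inferred: flac
    wav = np.random.randint(-100, 100, 16000, dtype=np.int16)
    f['utt1'] = (wav, 16000)  # stores (samples, sample_rate) as encoded bytes
    array, rate = f['utt1']   # decodes back to int16 samples
    assert rate == 16000 and np.array_equal(array, wav)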

@@ -17,11 +17,16 @@ import numpy as np
 from deepspeech.utils.log import Log

-__all__ = ["pad_sequence"]
+__all__ = ["pad_list", "pad_sequence"]

 logger = Log(__name__).getlog()


+def pad_list(sequences: List[np.ndarray],
+             padding_value: float=0.0) -> np.ndarray:
+    return pad_sequence(sequences, True, padding_value)
+
+
 def pad_sequence(sequences: List[np.ndarray],
                  batch_first: bool=True,
                  padding_value: float=0.0) -> np.ndarray:
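pad_list is simply a batch-first alias of pad_sequence. A numpy sketch of the intended semantics; pad_list_sketch is a stand-in, not the repo's implementation:

import numpy as np
from typing import List

def pad_list_sketch(seqs: List[np.ndarray],
                    padding_value: float=0.0) -> np.ndarray:
    # Pad variable-length arrays up to the longest one, batch-first.
    max_len = max(s.shape[0] for s in seqs)
    out = np.full((len(seqs), max_len) + seqs[0].shape[1:],
                  padding_value, dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        out[i, :s.shape[0]] = s
    return out

batch = pad_list_sketch([np.ones(3), np.ones(2)])  # shape (2, 3), zero-padded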

@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                     share_weights=share_rnn_weights))
             i_size = h_size * 2

-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)

     def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """

@@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"]
 logger = Log(__name__).getlog()


-class U2BaseModel(nn.Module):
+class U2BaseModel(nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""

     @classmethod

@@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"]
 logger = Log(__name__).getlog()


-class U2STBaseModel(nn.Module):
+class U2STBaseModel(nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""

     @classmethod

@@ -33,7 +33,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]


-class TransformerDecoder(nn.Module):
+class TransformerDecoder(nn.Layer):
     """Base class of Transfomer decoder module.

     Args:
         vocab_size: output dim
@@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module):
         self.use_output_layer = use_output_layer
         self.output_layer = nn.Linear(attention_dim, vocab_size)

-        self.decoders = nn.ModuleList([
+        self.decoders = nn.LayerList([
             DecoderLayer(
                 size=attention_dim,
                 self_attn=MultiHeadedAttention(attention_heads, attention_dim,

@@ -25,15 +25,15 @@ logger = Log(__name__).getlog()
 __all__ = ["DecoderLayer"]


-class DecoderLayer(nn.Module):
+class DecoderLayer(nn.Layer):
     """Single decoder layer module.

     Args:
         size (int): Input dimension.
-        self_attn (nn.Module): Self-attention module instance.
+        self_attn (nn.Layer): Self-attention module instance.
             `MultiHeadedAttention` instance can be used as the argument.
-        src_attn (nn.Module): Self-attention module instance.
+        src_attn (nn.Layer): Self-attention module instance.
             `MultiHeadedAttention` instance can be used as the argument.
-        feed_forward (nn.Module): Feed-forward module instance.
+        feed_forward (nn.Layer): Feed-forward module instance.
             `PositionwiseFeedForward` instance can be used as the argument.
         dropout_rate (float): Dropout rate.
         normalize_before (bool):
@@ -48,9 +48,9 @@ class DecoderLayer(nn.Module):
     def __init__(
             self,
             size: int,
-            self_attn: nn.Module,
-            src_attn: nn.Module,
-            feed_forward: nn.Module,
+            self_attn: nn.Layer,
+            src_attn: nn.Layer,
+            feed_forward: nn.Layer,
             dropout_rate: float,
             normalize_before: bool=True,
             concat_after: bool=False, ):

@@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder):
             pos_enc_layer_type, normalize_before, concat_after,
             static_chunk_size, use_dynamic_chunk, global_cmvn,
             use_dynamic_left_chunk)
-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
             TransformerEncoderLayer(
                 size=output_size,
                 self_attn=MultiHeadedAttention(attention_heads, output_size,
@@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder):
         convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                   cnn_module_norm, causal)

-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
             ConformerEncoderLayer(
                 size=output_size,
                 self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
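These renames drop the torch-style ModuleList alias in favor of Paddle's native container. A minimal LayerList sketch; TinyStack is illustrative, not from the repo:

import paddle
from paddle import nn

# LayerList registers sublayers so their parameters are tracked,
# just like the encoder stacks above.
class TinyStack(nn.Layer):
    def __init__(self, num_layers=2, dim=8):
        super().__init__()
        self.layers = nn.LayerList(
            [nn.Linear(dim, dim) for _ in range(num_layers)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

net = TinyStack()
y = net(paddle.randn([4, 8]))  # both Linear layers' parameters are registered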

@@ -48,7 +48,8 @@ class CTCLoss(nn.Layer):
         logits = logits.transpose([1, 0, 2])
         # (TODO:Hui Zhang) ctc loss does not support int64 labels
         ys_pad = ys_pad.astype(paddle.int32)
-        loss = self.loss(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(
+            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
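With this change, warpctc's per-sequence time normalization is enabled exactly when batch averaging is requested. A hedged usage sketch; the tensor shapes and the CTCLoss constructor arguments are assumptions, not the repo's exact API:

import paddle

# Illustrative shapes: B=2 utterances, T=50 encoder frames, V=100 tokens.
criterion = CTCLoss(blank=0, batch_average=True)  # constructor args assumed
logits = paddle.randn([2, 50, 100])           # (B, T, V), pre-softmax
ys_pad = paddle.randint(1, 100, [2, 20])      # padded label ids
hlens = paddle.to_tensor([50, 42])            # encoder output lengths
ys_lens = paddle.to_tensor([20, 15])          # label lengths per utterance
loss = criterion(logits, ys_pad, hlens, ys_lens)  # scalar, divided by B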

@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                     share_weights=share_rnn_weights))
             i_size = h_size * 2

-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)

     def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """

@@ -21,7 +21,6 @@
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 |
-
 ## Chunk Conformer

 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -39,4 +38,7 @@
 ### Test w/o length filter
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 7.63 | 0.056832 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_greedy_search | 7.63 | 0.059742 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_prefix_beam_search | 7.63 | 0.059057 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention_rescoring | 7.63 | 0.047417 |

@@ -4,7 +4,7 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test-clean
   min_input_len: 0.5 # second
-  max_input_len: 20.0 # second
+  max_input_len: 30.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # tokens
   min_output_input_ratio: 0.05

@@ -5,7 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-avg_num=30
+avg_num=5
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 avg_ckpt=avg_${avg_num}

@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+
+def main(args):
+    with open(args.json_file, 'r') as fin:
+        data_json = json.load(fin)
+
+    # manifest format:
+    # {"input": [
+    #        {"feat": "dev/deltafalse/feats.1.ark:842920", "name": "input1", "shape": [349, 83]}
+    #    ],
+    #  "output": [
+    #        {"name": "target1", "shape": [12, 5002], "text": "NO APOLLO", "token": "▁NO ▁A PO LL O", "tokenid": "3144 482 352 269 317"}
+    #    ],
+    #  "utt2spk": "116-288045",
+    #  "utt": "116-288045-0019"}
+    with open(args.manifest_file, 'w') as fout:
+        for key, value in data_json['utts'].items():
+            value['utt'] = key
+            fout.write(json.dumps(value, ensure_ascii=False))
+            fout.write("\n")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--json-file', type=str, default=None, help="espnet data json file.")
+    parser.add_argument(
+        '--manifest-file',
+        type=str,
+        default='manifest.train',
+        help='manifest data json line file.')
+    args = parser.parse_args()
+    main(args)
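The converter can also be driven from Python rather than the CLI; a hedged example where the script name and paths are illustrative:

import argparse

# Equivalent to:
#   python espnet_json_to_manifest.py --json-file data/train/data.json \
#       --manifest-file data/manifest.train
args = argparse.Namespace(
    json_file='data/train/data.json',
    manifest_file='data/manifest.train')
main(args)  # writes one JSON object per line, with the utt key injected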

@@ -5,7 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-avg_num=30
+avg_num=5
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 avg_ckpt=avg_${avg_num}

@@ -1,5 +1,6 @@
 coverage
 gpustat
+kaldiio
 pre-commit
 pybind11
 resampy==0.2.2
