From a48f61c6f1faed1ece87d514bce21c87f89b56da Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 16 Aug 2021 03:35:34 +0000
Subject: [PATCH 01/17] test w/ all example

---
 deepspeech/exps/u2/model.py                   | 12 ++++++------
 examples/librispeech/s1/README.md             |  2 +-
 examples/librispeech/s1/conf/transformer.yaml |  2 +-
 examples/librispeech/s1/run.sh                |  2 +-
 examples/tiny/s1/conf/transformer.yaml        |  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index d661f078d..0662e38d9 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -264,12 +264,12 @@ class U2Trainer(Trainer):
         config.data.manifest = config.data.test_manifest
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
-        # config.data.min_input_len = 0.0  # second
-        # config.data.max_input_len = float('inf')  # second
-        # config.data.min_output_len = 0.0  # tokens
-        # config.data.max_output_len = float('inf')  # tokens
-        # config.data.min_output_input_ratio = 0.00
-        # config.data.max_output_input_ratio = float('inf')
+        config.data.min_input_len = 0.0  # second
+        config.data.max_input_len = float('inf')  # second
+        config.data.min_output_len = 0.0  # tokens
+        config.data.max_output_len = float('inf')  # tokens
+        config.data.min_output_input_ratio = 0.00
+        config.data.max_output_input_ratio = float('inf')
 
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id
diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
index daa4d175b..2dd508664 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@@ -21,7 +21,6 @@
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 |  
 
-
 ## Chunk Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |  
@@ -40,3 +39,4 @@
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 7.63 | 0.056832 |  
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index 261886770..ba8ccc827 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -4,7 +4,7 @@ data:
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test-clean
   min_input_len: 0.5  # second
-  max_input_len: 20.0 # second
+  max_input_len: 30.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # tokens
   min_output_input_ratio: 0.05
diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh
index 2a8f2e2d1..def10ab05 100755
--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
@@ -5,7 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-avg_num=30
+avg_num=5
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 avg_ckpt=avg_${avg_num}
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index e97ad7565..fd5adbdee 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -14,7 +14,7 @@ collator:
   mean_std_filepath: ""
   vocab_filepath: data/vocab.txt 
   unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_202'
+  spm_model_prefix: 'data/bpe_unigram_200'
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True  # use raw_wav or kaldi feature

From df0b9ead2552e26eb1987b4cb1679da7b89f424a Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 16 Aug 2021 06:24:26 +0000
Subject: [PATCH 02/17] more result

---
 examples/librispeech/s1/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
index 2dd508664..4cb3629de 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@@ -38,5 +38,7 @@
 ### Test w/o length filter
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |  
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 7.63 | 0.056832 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_greedy_search | 7.63 | 0.059742 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_prefix_beam_search | 7.63 | 0.059057 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention_rescoring | 7.63 | 0.047417 |  

From 32883dca4b0c93b6edb79bf2354e7048c1d293a6 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 16 Aug 2021 08:32:51 +0000
Subject: [PATCH 03/17] add batchfy

---
 deepspeech/io/batchfy.py                      | 470 ++++++++++++++++++
 .../s2/local/espnet_json_to_manifest.py       |  36 ++
 examples/librispeech/s2/run.sh                |   2 +-
 3 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 deepspeech/io/batchfy.py
 create mode 100755 examples/librispeech/s2/local/espnet_json_to_manifest.py

diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py
new file mode 100644
index 000000000..31fa2392b
--- /dev/null
+++ b/deepspeech/io/batchfy.py
@@ -0,0 +1,470 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+
+import logger
+import numpy as np
+
+from deepspeech.utils.log import Log
+
+__all__ = ["make_batchset"]
+
+logger = Log(__name__).getlog()
+
+
+def batchfy_by_seq(
+        sorted_data,
+        batch_size,
+        max_length_in,
+        max_length_out,
+        min_batch_size=1,
+        shortest_first=False,
+        ikey="input",
+        iaxis=0,
+        okey="output",
+        oaxis=0, ):
+    """Make minibatches whose size shrinks as the utterances get longer.
+
+    :param List[(str, Dict[str, Any])] sorted_data: sorted (utt id, info) pairs
+    :param int batch_size: batch size before the length based reduction
+    :param int max_length_in: maximum length of input to decide adaptive batch size
+    :param int max_length_out: maximum length of output to decide adaptive batch size
+    :param int min_batch_size: minimum batch size (for multi-gpu)
+    :param bool shortest_first: Sort from batch with shortest samples
+        to longest if true, otherwise reverse
+    :param str ikey: key to access input
+        (for ASR ikey="input", for TTS, MT ikey="output".)
+    :param int iaxis: dimension to access input
+        (for ASR, TTS iaxis=0, for MT iaxis=1.)
+    :param str okey: key to access output
+        (for ASR, MT okey="output". for TTS okey="input".)
+    :param int oaxis: dimension to access output (-1 means all axis.)
+    :return: List[List[Tuple[str, dict]]] list of batches
+    :raises ValueError: if batch_size <= 0 or #utts < min_batch_size
+    """
+    if batch_size <= 0:
+        raise ValueError(f"Invalid batch_size={batch_size}")
+
+    # check #utts is more than min_batch_size
+    if len(sorted_data) < min_batch_size:
+        raise ValueError(
+            f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
+        )
+
+    # make list of minibatches
+    minibatches = []
+    start = 0
+    while True:
+        _, info = sorted_data[start]
+        ilen = int(info[ikey][iaxis]["shape"][0])
+        olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else
+                max(map(lambda x: int(x["shape"][0]), info[okey])))
+        factor = max(int(ilen / max_length_in), int(olen / max_length_out))
+        # change batchsize depending on the input and output length
+        # if ilen = 1000 and max_length_in = 800
+        # then b = batchsize / 2
+        # and max(min_batch_size, .) avoids batchsize = 0
+        bs = max(min_batch_size, int(batch_size / (1 + factor)))
+        end = min(len(sorted_data), start + bs)
+        minibatch = sorted_data[start:end]
+        if shortest_first:
+            minibatch.reverse()
+
+        # fill a too short batch with randomly chosen earlier utterances
+        if len(minibatch) < min_batch_size:
+            mod = min_batch_size - len(minibatch) % min_batch_size
+            additional_minibatch = [
+                sorted_data[i] for i in np.random.randint(0, start, mod)
+            ]
+            if shortest_first:
+                additional_minibatch.reverse()
+            minibatch.extend(additional_minibatch)
+        minibatches.append(minibatch)
+
+        if end == len(sorted_data):
+            break
+        start = end
+
+    # batch: List[List[Tuple[str, dict]]]
+    return minibatches
+
+
+def batchfy_by_bin(
+        sorted_data,
+        batch_bins,
+        num_batches=0,
+        min_batch_size=1,
+        shortest_first=False,
+        ikey="input",
+        okey="output", ):
+    """Make variably sized batch set, which maximizes
+    the number of bins up to `batch_bins`.
+
+    The bins of a batch are counted as (max olen + ilen) * (#samples in it).
+
+    :param List[(str, Dict[str, Any])] sorted_data: sorted (utt id, info) pairs
+    :param int batch_bins: Maximum bins (frames x dim) of a batch
+    :param int num_batches: # number of batches to use (for debug)
+    :param int min_batch_size: minimum batch size (for multi-gpu)
+    :param bool shortest_first: Sort from batch with shortest samples
+        to longest if true, otherwise reverse
+    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
+    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
+
+    :return: List[List[Tuple[str, dict]]] list of batches
+    :raises ValueError: if batch_bins <= 0
+    """
+    if batch_bins <= 0:
+        raise ValueError(f"invalid batch_bins={batch_bins}")
+    length = len(sorted_data)
+    idim = int(sorted_data[0][1][ikey][0]["shape"][1])
+    odim = int(sorted_data[0][1][okey][0]["shape"][1])
+    logger.info("# utts: " + str(len(sorted_data)))
+    minibatches = []
+    start = 0
+    n = 0  # NOTE(review): incremented below but never read
+    while True:
+        # Dynamic batch size: grow b until the next sample would exceed batch_bins
+        b = 0
+        next_size = 0
+        max_olen = 0
+        while next_size < batch_bins and (start + b) < length:
+            ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
+            olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
+            if olen > max_olen:
+                max_olen = olen
+            next_size = (max_olen + ilen) * (b + 1)
+            if next_size <= batch_bins:
+                b += 1
+            elif next_size == 0:  # NOTE(review): unreachable, next_size > batch_bins > 0 here
+                raise ValueError(
+                    f"Can't fit one sample in batch_bins ({batch_bins}): "
+                    f"Please increase the value")
+        end = min(length, start + max(min_batch_size, b))
+        batch = sorted_data[start:end]
+        if shortest_first:
+            batch.reverse()
+        minibatches.append(batch)
+        # Too small batches borrow samples from the batch before them
+        i = -1
+        while len(minibatches[i]) < min_batch_size:
+            missing = min_batch_size - len(minibatches[i])
+            if -i == len(minibatches):
+                minibatches[i + 1].extend(minibatches[i])
+                minibatches = minibatches[1:]
+                break
+            else:
+                minibatches[i].extend(minibatches[i - 1][:missing])
+                minibatches[i - 1] = minibatches[i - 1][missing:]
+                i -= 1
+        if end == length:
+            break
+        start = end
+        n += 1
+    if num_batches > 0:
+        minibatches = minibatches[:num_batches]
+    lengths = [len(x) for x in minibatches]
+    logger.info(
+        str(len(minibatches)) + " batches containing from " + str(min(lengths))
+        + " to " + str(max(lengths)) + " samples " + "(avg " + str(
+            int(np.mean(lengths))) + " samples).")
+    return minibatches
+
+
+def batchfy_by_frame(
+        sorted_data,
+        max_frames_in,
+        max_frames_out,
+        max_frames_inout,
+        num_batches=0,
+        min_batch_size=1,
+        shortest_first=False,
+        ikey="input",
+        okey="output", ):
+    """Make variable batch set, which maximizes the number of frames per batch.
+
+    A limit of 0 disables the corresponding check.
+
+    :param List[(str, Dict[str, Any])] sorted_data: sorted (utt id, info) pairs
+    :param int max_frames_in: Maximum input frames of a batch
+    :param int max_frames_out: Maximum output frames of a batch
+    :param int max_frames_inout: Maximum input+output frames of a batch
+    :param int num_batches: # number of batches to use (for debug)
+    :param int min_batch_size: minimum batch size (for multi-gpu)
+    :param bool shortest_first: Sort from batch with shortest samples
+        to longest if true, otherwise reverse
+    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
+    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
+    :return: List[List[Tuple[str, dict]]] list of batches
+    :raises ValueError: if no limit is > 0 or one sample exceeds a limit
+    """
+    if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
+        raise ValueError(
+            "At least, one of `--batch-frames-in`, `--batch-frames-out` or "
+            "`--batch-frames-inout` should be > 0")
+    length = len(sorted_data)
+    minibatches = []
+    start = 0
+    end = 0
+    while end != length:
+        # Dynamic batch size depending on size of samples
+        b = 0
+        max_olen = 0
+        max_ilen = 0
+        while (start + b) < length:
+            ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
+            if ilen > max_frames_in and max_frames_in != 0:
+                raise ValueError(
+                    f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
+                    f"Please increase the value")
+            olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
+            if olen > max_frames_out and max_frames_out != 0:
+                raise ValueError(
+                    f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
+                    f"Please increase the value")
+            if ilen + olen > max_frames_inout and max_frames_inout != 0:
+                raise ValueError(
+                    f"Can't fit one sample in --batch-frames-inout ({max_frames_inout}): "
+                    f"Please increase the value")
+            max_olen = max(max_olen, olen)
+            max_ilen = max(max_ilen, ilen)
+            in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
+            out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
+            inout_ok = (max_ilen + max_olen) * (
+                b + 1) <= max_frames_inout or max_frames_inout == 0
+            if in_ok and out_ok and inout_ok:
+                # add more seq in the minibatch
+                b += 1
+            else:
+                # no more seq in the minibatch
+                break
+        end = min(length, start + b)
+        batch = sorted_data[start:end]
+        if shortest_first:
+            batch.reverse()
+        minibatches.append(batch)
+        # Too small batches borrow samples from the batch before them
+        i = -1
+        while len(minibatches[i]) < min_batch_size:
+            missing = min_batch_size - len(minibatches[i])
+            if -i == len(minibatches):
+                minibatches[i + 1].extend(minibatches[i])
+                minibatches = minibatches[1:]
+                break
+            else:
+                minibatches[i].extend(minibatches[i - 1][:missing])
+                minibatches[i - 1] = minibatches[i - 1][missing:]
+                i -= 1
+        start = end
+    if num_batches > 0:
+        minibatches = minibatches[:num_batches]
+    lengths = [len(x) for x in minibatches]
+    logger.info(
+        str(len(minibatches)) + " batches containing from " + str(min(lengths))
+        + " to " + str(max(lengths)) + " samples " + "(avg " + str(
+            int(np.mean(lengths))) + " samples).")
+
+    return minibatches
+
+
+def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,
+                    shortest_first):
+    """Make fixed-size minibatches from randomly shuffled `data` items."""
+    import random
+    logger.info("use shuffled batch.")
+    # random.sample needs a sequence: dict views are rejected on Python >= 3.11
+    sorted_data = random.sample(list(data.items()), len(data))
+    logger.info("# utts: " + str(len(sorted_data)))
+    minibatches = []
+    start = 0
+    while True:
+        end = min(len(sorted_data), start + batch_size)
+        # fill a too short batch with randomly chosen earlier utterances
+        minibatch = sorted_data[start:end]
+        if shortest_first:
+            minibatch.reverse()
+        if len(minibatch) < min_batch_size:
+            mod = min_batch_size - len(minibatch) % min_batch_size
+            additional_minibatch = [
+                sorted_data[i] for i in np.random.randint(0, start, mod)
+            ]
+            if shortest_first:
+                additional_minibatch.reverse()
+            minibatch.extend(additional_minibatch)
+        minibatches.append(minibatch)
+        if end == len(sorted_data):
+            break
+        start = end
+
+    # for debugging
+    if num_batches > 0:
+        minibatches = minibatches[:num_batches]
+        logger.info("# minibatches: " + str(len(minibatches)))
+    return minibatches
+
+
+BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]  # valid `count` values
+BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]  # valid `batch_sort_key` values
+
+
+def make_batchset(
+        data,
+        batch_size=0,
+        max_length_in=float("inf"),
+        max_length_out=float("inf"),
+        num_batches=0,
+        min_batch_size=1,
+        shortest_first=False,
+        batch_sort_key="input",
+        count="auto",
+        batch_bins=0,
+        batch_frames_in=0,
+        batch_frames_out=0,
+        batch_frames_inout=0,
+        iaxis=0,
+        oaxis=0, ):
+    """Make batch set from json dictionary
+
+    if utts have "category" value,
+
+        >>> data = {'utt1': {'category': 'A', 'input': ...},
+        ...         'utt2': {'category': 'B', 'input': ...},
+        ...         'utt3': {'category': 'B', 'input': ...},
+        ...         'utt4': {'category': 'A', 'input': ...}}
+        >>> make_batchset(data, batch_size=2, ...)
+        [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]
+
+    Note that utts which don't have "category" are batched together,
+    so without any "category" it performs the same as batchfy_by_{count}
+
+    :param Dict[str, Dict[str, Any]] data: dictionary loaded from data.json
+    :param int batch_size: maximum number of sequences in a minibatch.
+    :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
+    :param int batch_frames_in:  maximum number of input frames in a minibatch.
+    :param int batch_frames_out: maximum number of output frames in a minibatch.
+    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
+    :param str count: strategy to count maximum size of batch.
+        For choices, see BATCH_COUNT_CHOICES of this module
+
+    :param int max_length_in: maximum length of input to decide adaptive batch size
+    :param int max_length_out: maximum length of output to decide adaptive batch size
+    :param int num_batches: # number of batches to use (for debug)
+    :param int min_batch_size: minimum batch size (for multi-gpu)
+    :param bool shortest_first: Sort from batch with shortest samples
+        to longest if true, otherwise reverse
+    :param str batch_sort_key: how to sort data before creating minibatches
+        ["input", "output", "shuffle"]
+    :param int iaxis: dimension to access input, only used when count is "seq"
+        (for ASR, TTS iaxis=0, for MT iaxis=1.)
+    :param int oaxis: dimension to access output, only used when count is "seq"
+        (for ASR, TTS, MT oaxis=0,
+        reserved for future research, -1 means all axis.)
+    :return: List[List[Tuple[str, dict]]] list of batches
+    :raises ValueError: if `count` or `batch_sort_key` is not a valid choice,
+        if `count` can not be auto detected, or if "shuffle" is requested
+        together with a `count` other than "seq"
+    """
+
+    # check args
+    if count not in BATCH_COUNT_CHOICES:
+        raise ValueError(
+            f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
+    if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
+        raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
+                         f"one of {BATCH_SORT_KEY_CHOICES}")
+
+    ikey = "input"
+    okey = "output"
+    batch_sort_axis = 0  # index of list
+
+    if count == "auto":
+        if batch_size != 0:
+            count = "seq"
+        elif batch_bins != 0:
+            count = "bin"
+        elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
+            count = "frame"
+        else:
+            raise ValueError(
+                f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
+            )
+        logger.info(f"count is auto detected as {count}")
+
+    if count != "seq" and batch_sort_key == "shuffle":
+        raise ValueError(
+            "batch_sort_key=shuffle is only available if batch_count=seq")
+
+    category2data = {}  # Dict[str, dict]
+    for k, v in data.items():
+        category2data.setdefault(v.get("category"), {})[k] = v
+
+    batches_list = []  # List[List[List[Tuple[str, dict]]]]
+    for d in category2data.values():
+        if batch_sort_key == "shuffle":
+            batches = batchfy_shuffle(d, batch_size, min_batch_size,
+                                      num_batches, shortest_first)
+            batches_list.append(batches)
+            continue
+
+        # sort by `batch_sort_key` length (long to short unless shortest_first)
+        sorted_data = sorted(
+            d.items(),
+            key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
+            reverse=not shortest_first, )
+        logger.info("# utts: " + str(len(sorted_data)))
+        if count == "seq":
+            batches = batchfy_by_seq(
+                sorted_data,
+                batch_size=batch_size,
+                max_length_in=max_length_in,
+                max_length_out=max_length_out,
+                min_batch_size=min_batch_size,
+                shortest_first=shortest_first,
+                ikey=ikey,
+                iaxis=iaxis,
+                okey=okey,
+                oaxis=oaxis, )
+        if count == "bin":
+            batches = batchfy_by_bin(
+                sorted_data,
+                batch_bins=batch_bins,
+                min_batch_size=min_batch_size,
+                shortest_first=shortest_first,
+                ikey=ikey,
+                okey=okey, )
+        if count == "frame":
+            batches = batchfy_by_frame(
+                sorted_data,
+                max_frames_in=batch_frames_in,
+                max_frames_out=batch_frames_out,
+                max_frames_inout=batch_frames_inout,
+                min_batch_size=min_batch_size,
+                shortest_first=shortest_first,
+                ikey=ikey,
+                okey=okey, )
+        batches_list.append(batches)
+
+    if len(batches_list) == 1:
+        batches = batches_list[0]
+    else:
+        # Concat list. This way is faster than "sum(batch_list, [])"
+        batches = list(itertools.chain(*batches_list))
+
+    # for debugging
+    if num_batches > 0:
+        batches = batches[:num_batches]
+    logger.info("# minibatches: " + str(len(batches)))
+
+    # batch: List[List[Tuple[str, dict]]]
+    return batches
diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/s2/local/espnet_json_to_manifest.py
new file mode 100755
index 000000000..acfa46681
--- /dev/null
+++ b/examples/librispeech/s2/local/espnet_json_to_manifest.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+
+def main(args):
+    """Convert an espnet data json file into a json-lines manifest file."""
+    with open(args.json_file, 'r', encoding='utf-8') as fin:
+        data_json = json.load(fin)
+    # manifest format (one json object per line, written as UTF-8):
+    # {"input": [
+    #       {"feat": "dev/deltafalse/feats.1.ark:842920", "name": "input1", "shape": [349, 83]}
+    #  ],
+    #  "output": [
+    #       {"name": "target1", "shape": [12, 5002], "text": "NO APOLLO", "token": "▁NO ▁A PO LL O", "tokenid": "3144 482 352 269 317"}
+    #  ],
+    #  "utt2spk": "116-288045",
+    #  "utt": "116-288045-0019"}
+    with open(args.manifest_file, 'w', encoding='utf-8') as fout:
+        for key, value in data_json['utts'].items():
+            value['utt'] = key
+            fout.write(json.dumps(value, ensure_ascii=False))
+            fout.write("\n")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--json-file', type=str, required=True, help="espnet data json file.")
+    parser.add_argument(
+        '--manifest-file',
+        type=str,
+        default='manifest.train',
+        help='manifest data json line file.')
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh
index 2a8f2e2d1..def10ab05 100755
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
@@ -5,7 +5,7 @@ source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
-avg_num=30
+avg_num=5
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 avg_ckpt=avg_${avg_num}

From 5fbced8b52ad602bd6572b4216e2e5ec086d6abb Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 16 Aug 2021 11:44:48 +0000
Subject: [PATCH 04/17] more data utils

---
 deepspeech/io/batchfy.py    |   1 -
 deepspeech/io/dataloader.py | 177 +++++++++++++++++++++
 deepspeech/io/dataset.py    |  26 ++-
 deepspeech/io/utility.py    | 305 +++++++++++++++++++++++++++++++++++-
 4 files changed, 506 insertions(+), 3 deletions(-)
 create mode 100644 deepspeech/io/dataloader.py

diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py
index 31fa2392b..d237eb749 100644
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 import itertools
 
-import logger
 import numpy as np
 
 from deepspeech.utils.log import Log
diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py
new file mode 100644
index 000000000..0c5034caa
--- /dev/null
+++ b/deepspeech/io/dataloader.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+from paddle.io import DataLoader
+
+from deepspeech.frontend.utility import read_manifest
+from deepspeech.io.batchfy import make_batchset
+from deepspeech.io.dataset import TransformDataset
+from deepspeech.io.utility import LoadInputsAndTargets
+from deepspeech.io.utility import pad_list
+from deepspeech.utils.log import Log
+
+__all__ = ["CustomConverter", "BatchDataLoader"]
+
+logger = Log(__name__).getlog()
+
+
+class CustomConverter():
+    """Custom batch converter.
+
+    Args:
+        subsampling_factor (int): The subsampling factor.
+        dtype (np.dtype): NumPy data type of the padded inputs.
+
+    """
+
+    def __init__(self, subsampling_factor=1, dtype=np.float32):
+        """Construct a CustomConverter object."""
+        self.subsampling_factor = subsampling_factor
+        self.ignore_id = -1  # padding value of the targets
+        self.dtype = dtype
+
+    def __call__(self, batch):
+        """Pad one minibatch of inputs and targets.
+
+        Args:
+            batch (list): One-element list holding the `(xs, ys)` pair.
+
+        Returns:
+            tuple: `(xs_pad, ilens, ys_pad, olens)`; `ilens` is a
+                paddle.Tensor, the others come from pad_list / np.array.
+        """
+        # batch should be located in list
+        assert len(batch) == 1
+        xs, ys = batch[0]
+
+        # perform subsampling
+        if self.subsampling_factor > 1:
+            xs = [x[::self.subsampling_factor, :] for x in xs]
+
+        # get batch of lengths of input sequences
+        ilens = np.array([x.shape[0] for x in xs])
+
+        # perform padding and cast to self.dtype (complex input is split)
+        if xs[0].dtype.kind == "c":
+            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
+            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
+            # Note(kamo):
+            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
+            # Don't create ComplexTensor and give it E2E here
+            # because torch.nn.DataParallel can't handle it.
+            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
+        else:
+            xs_pad = pad_list(xs, 0).astype(self.dtype)
+
+        ilens = paddle.to_tensor(ilens)
+
+        # NOTE: this is for multi-output (e.g., speech translation)
+        ys_pad = pad_list(
+            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
+            self.ignore_id)
+        # tuple targets carry their token ids in y[0], exactly as in ys_pad
+        olens = np.array(
+            [len(y[0]) if isinstance(y, tuple) else y.shape[0] for y in ys])
+        return xs_pad, ilens, ys_pad, olens
+
+
+class BatchDataLoader():
+    """Batch a manifest with `make_batchset` and serve it via DataLoader."""
+
+    def __init__(self,
+                 json_file: str,
+                 train_mode: bool,
+                 sortagrad: bool=False,
+                 batch_size: int=0,
+                 maxlen_in: float=float('inf'),
+                 maxlen_out: float=float('inf'),
+                 minibatches: int=0,
+                 mini_batch_size: int=1,
+                 batch_count: str='auto',
+                 batch_bins: int=0,
+                 batch_frames_in: int=0,
+                 batch_frames_out: int=0,
+                 batch_frames_inout: int=0,
+                 preprocess_conf=None,
+                 n_iter_processes: int=1,
+                 subsampling_factor: int=1,
+                 num_encs: int=1):
+        self.json_file = json_file
+        self.train_mode = train_mode
+        self.use_sortagrad = sortagrad == -1 or sortagrad > 0
+        self.batch_size = batch_size
+        self.maxlen_in = maxlen_in
+        self.maxlen_out = maxlen_out
+        self.batch_count = batch_count
+        self.batch_bins = batch_bins
+        self.batch_frames_in = batch_frames_in
+        self.batch_frames_out = batch_frames_out
+        self.batch_frames_inout = batch_frames_inout
+
+        self.subsampling_factor = subsampling_factor
+        self.num_encs = num_encs
+        self.preprocess_conf = preprocess_conf
+        self.n_iter_processes = n_iter_processes
+
+        # NOTE(review): make_batchset iterates data.items(); confirm read_manifest returns a dict
+        data_json = read_manifest(json_file)
+        logger.info(f"load {json_file} file.")
+
+        # make minibatch list (variable length)
+        self.data = make_batchset(
+            data_json,
+            batch_size,
+            maxlen_in,
+            maxlen_out,
+            minibatches,  # for debug
+            min_batch_size=mini_batch_size,
+            shortest_first=self.use_sortagrad,
+            count=batch_count,
+            batch_bins=batch_bins,
+            batch_frames_in=batch_frames_in,
+            batch_frames_out=batch_frames_out,
+            batch_frames_inout=batch_frames_inout,
+            iaxis=0,
+            oaxis=0, )
+        logger.info(f"batchfy data {json_file}: {len(self.data)}.")
+
+        self.load = LoadInputsAndTargets(
+            mode="asr",
+            load_output=True,
+            preprocess_conf=preprocess_conf,
+            preprocess_args={"train":
+                             train_mode},  # Switch the mode of preprocessing
+        )
+
+        # Setup a converter
+        if num_encs == 1:
+            self.converter = CustomConverter(
+                subsampling_factor=subsampling_factor, dtype=np.float32)
+        else:
+            raise NotImplementedError("not impl CustomConverterMulEnc.")
+
+        # hack to make batchsize argument as 1
+        # actual batchsize is included in a list: every dataset item is
+        # already a whole minibatch, so collate_fn only unwraps the
+        # one-element list that DataLoader builds
+        self.train_loader = DataLoader(
+            dataset=TransformDataset(
+                self.data, lambda data: self.converter([self.load(data)])),
+            batch_size=1,
+            shuffle=not self.use_sortagrad if train_mode else False,
+            collate_fn=lambda x: x[0],
+            num_workers=n_iter_processes, )
+        logger.info(f"dataloader for {json_file}.")
+
+    def __repr__(self):
+        return f"DataLoader {self.json_file}-{self.train_mode}-{self.use_sortagrad}"
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index ac7be1f9e..e2db93404 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log
 
-__all__ = ["ManifestDataset", "TripletManifestDataset"]
+__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
 
 logger = Log(__name__).getlog()
 
@@ -116,3 +116,27 @@ class TripletManifestDataset(ManifestDataset):
         instance = self._manifest[idx]
         return instance["utt"], instance["feat"], instance["text"], instance[
             "text1"]
+
+
+class TransformDataset(Dataset):
+    """Dataset wrapper that applies ``transform`` to an item when it is read.
+
+    Args:
+        data: indexable, sized collection (e.g. the list from make_batchset)
+        transform: callable applied to ``data[idx]`` inside ``__getitem__``
+    """
+
+    def __init__(self, data, transform):
+        """Store the raw items and the per-item transform."""
+        super().__init__()
+        self.transform = transform
+        self.data = data
+
+    def __len__(self):
+        """Return the number of raw items."""
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        """Return the transformed item stored at ``idx``."""
+        raw_item = self.data[idx]
+        return self.transform(raw_item)
diff --git a/deepspeech/io/utility.py b/deepspeech/io/utility.py
index 0cd37428b..915813f3a 100644
--- a/deepspeech/io/utility.py
+++ b/deepspeech/io/utility.py
@@ -11,17 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections import OrderedDict
 from typing import List
 
 import numpy as np
 
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.utils.log import Log
 
-__all__ = ["pad_sequence"]
+__all__ = ["pad_list", "pad_sequence", "LoadInputsAndTargets"]
 
 logger = Log(__name__).getlog()
 
 
+def pad_list(sequences: List[np.ndarray], padding_value: float=0.0) -> np.ndarray:
+    """Pad ``sequences`` by calling ``pad_sequence`` with ``batch_first=True``."""
+    return pad_sequence(sequences, batch_first=True, padding_value=padding_value)
+
+
 def pad_sequence(sequences: List[np.ndarray],
                  batch_first: bool=True,
                  padding_value: float=0.0) -> np.ndarray:
@@ -80,3 +87,299 @@ def pad_sequence(sequences: List[np.ndarray],
             out_tensor[:length, i, ...] = tensor
 
     return out_tensor
+
+
+class LoadInputsAndTargets():
+    """Create a mini-batch from a list of dicts
+
+    >>> batch = [('utt1',
+    ...           dict(input=[dict(feat='some.ark:123',
+    ...                            filetype='mat',
+    ...                            name='input1',
+    ...                            shape=[100, 80])],
+    ...                output=[dict(tokenid='1 2 3 4',
+    ...                             name='target1',
+    ...                             shape=[4, 31])]))]
+    >>> l = LoadInputsAndTargets()
+    >>> feat, target = l(batch)
+
+    :param str mode: Task mode; only "asr" is accepted
+    :param preprocess_conf: Configuration handed to AugmentationPipeline;
+        None disables preprocessing
+    :param bool load_input: If False, not to load the input data
+    :param bool load_output: If False, not to load the output data
+    :param bool sort_in_input_length: Sort the mini-batch in descending order
+        of the input length
+    :param dict preprocess_args: Extra keyword arguments for preprocessing
+    :param bool keep_all_data_on_mem: Keep arrays read for the "sound", "npy",
+        "mat" and "vec" file types cached in memory
+    """
+
+    def __init__(
+            self,
+            mode="asr",
+            preprocess_conf=None,
+            load_input=True,
+            load_output=True,
+            sort_in_input_length=True,
+            preprocess_args=None,
+            keep_all_data_on_mem=False, ):
+        self._loaders = {}  # file path -> opened loader or cached array
+
+        if mode not in ["asr"]:
+            raise ValueError("Only asr are allowed: mode={}".format(mode))
+
+        if preprocess_conf is not None:
+            self.preprocessing = AugmentationPipeline(preprocess_conf)
+            logger.warning(
+                "[Experimental feature] Some preprocessing will be done "
+                "for the mini-batch creation using {}".format(
+                    self.preprocessing))
+        else:
+            # If conf doesn't exist, this function don't touch anything.
+            self.preprocessing = None
+
+        self.mode = mode
+        self.load_output = load_output
+        self.load_input = load_input
+        self.sort_in_input_length = sort_in_input_length
+        if preprocess_args is None:
+            self.preprocess_args = {}
+        else:
+            assert isinstance(preprocess_args, dict), type(preprocess_args)
+            self.preprocess_args = dict(preprocess_args)
+
+        self.keep_all_data_on_mem = keep_all_data_on_mem
+
+    def __call__(self, batch, return_uttid=False):
+        """Load inputs and targets for one mini-batch.
+
+        :param List[Tuple[str, dict]] batch: (uttid, info) pairs, a subset
+            of the loaded data.json
+        :param bool return_uttid: also return the utterance IDs
+        :return: tuple holding one list of arrays per input name, followed
+            by one list per target name when load_output is set; with
+            return_uttid=True the result is (that tuple, uttid_list)
+        :rtype: Tuple[List[np.ndarray], ...]
+
+        When targets are loaded, samples with an empty target are dropped,
+        so the returned lists can be shorter than ``batch``.
+        """
+        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
+        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
+        uttid_list = []  # List[str]
+
+        for uttid, info in batch:
+            uttid_list.append(uttid)
+
+            if self.load_input:
+                # Note(kamo): This for-loop is for multiple inputs
+                for inp in info["input"]:
+                    # {"input":
+                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+                    #    "filetype": "hdf5",
+                    #    "name": "input1", ...}], ...}
+                    x = self._get_from_loader(
+                        filepath=inp["feat"],
+                        filetype=inp.get("filetype", "mat"))
+                    x_feats_dict.setdefault(inp["name"], []).append(x)
+
+            if self.load_output:
+                for inp in info["output"]:
+                    if "tokenid" in inp:
+                        # ======= Legacy format for output =======
+                        # {"output": [{"tokenid": "1 2 3 4"}])
+                        x = np.fromiter(
+                            map(int, inp["tokenid"].split()), dtype=np.int64)
+                    else:
+                        # ======= New format =======
+                        # {"output":
+                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+                        #    "filetype": "hdf5",
+                        #    "name": "target1", ...}], ...}
+                        x = self._get_from_loader(
+                            filepath=inp["feat"],
+                            filetype=inp.get("filetype", "mat"))
+
+                    y_feats_dict.setdefault(inp["name"], []).append(x)
+
+        if self.mode == "asr":
+            return_batch, uttid_list = self._create_batch_asr(
+                x_feats_dict, y_feats_dict, uttid_list)
+        else:
+            raise NotImplementedError(self.mode)
+
+        if self.preprocessing is not None:
+            # Apply pre-processing all input features
+            for x_name in return_batch:
+                if x_name.startswith("input"):
+                    return_batch[x_name] = self.preprocessing(
+                        return_batch[x_name], uttid_list,
+                        **self.preprocess_args)
+
+        if return_uttid:
+            return tuple(return_batch.values()), uttid_list
+
+        # Doesn't return the names now.
+        return tuple(return_batch.values())
+
+    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
+        """Create a OrderedDict for the mini-batch
+
+        :param OrderedDict x_feats_dict:
+            e.g. {"input1": [ndarray, ndarray, ...],
+                  "input2": [ndarray, ndarray, ...]}
+        :param OrderedDict y_feats_dict:
+            e.g. {"target1": [ndarray, ndarray, ...],
+                  "target2": [ndarray, ndarray, ...]}
+        :param List[str] uttid_list:
+            Give uttid_list to sort in the same order as the mini-batch
+        :return: batch, uttid_list
+        :rtype: Tuple[OrderedDict, List[str]]
+        """
+        # handle single-input and multi-input (parallel) asr mode
+        xs = list(x_feats_dict.values())
+
+        if self.load_output:
+            ys = list(y_feats_dict.values())
+            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))
+
+            # Keep only the samples whose every target is non-empty. Build a
+            # real list: a lazy filter has no len() and is spent after one pass.
+            nonzero_idx = [
+                i for i in range(len(ys[0])) if all(len(y[i]) > 0 for y in ys)
+            ]
+        else:
+            # Note(kamo): Be careful not to make nonzero_idx to a generator
+            nonzero_idx = list(range(len(xs[0])))
+
+        if self.sort_in_input_length:
+            # sort in input lengths based on the first input
+            nonzero_sorted_idx = sorted(
+                nonzero_idx, key=lambda i: -len(xs[0][i]))
+        else:
+            nonzero_sorted_idx = nonzero_idx
+
+        if len(nonzero_sorted_idx) != len(xs[0]):
+            logger.warning(
+                "Target sequences include empty tokenid (batch {} -> {}).".
+                format(len(xs[0]), len(nonzero_sorted_idx)))
+
+        # remove zero-length samples
+        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
+        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
+
+        x_names = list(x_feats_dict.keys())
+        if self.load_output:
+            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
+            y_names = list(y_feats_dict.keys())
+
+            # Keeping x_name and y_name, e.g. input1, for future extension
+            return_batch = OrderedDict([
+                *zip(x_names, xs),
+                *zip(y_names, ys),
+            ])
+        else:
+            return_batch = OrderedDict(
+                [(x_name, x) for x_name, x in zip(x_names, xs)])
+        return return_batch, uttid_list
+
+    def _get_from_loader(self, filepath, filetype):
+        """Return ndarray
+
+        In order to make the fds to be opened only at the first referring,
+        the loader are stored in self._loaders
+
+        >>> ndarray = loader._get_from_loader(
+        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')
+
+        :param str filepath: path, or "path:key" for container formats
+        :param str filetype: one of the file types handled below
+        :raises NotImplementedError: for any other filetype
+        :rtype: np.ndarray
+        """
+        if filetype == "hdf5":
+            # e.g.
+            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+            #                "filetype": "hdf5",
+            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
+            filepath, key = filepath.split(":", 1)
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                import h5py  # imported lazily; only this branch needs it
+                loader = h5py.File(filepath, "r")
+                self._loaders[filepath] = loader
+            return loader[key][()]
+        elif filetype == "sound.hdf5":
+            # e.g.
+            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+            #                "filetype": "sound.hdf5",
+            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
+            filepath, key = filepath.split(":", 1)
+            # NOTE(review): SoundHDF5File is not imported by this module - confirm
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = SoundHDF5File(filepath, "r", dtype="int16")
+                self._loaders[filepath] = loader
+            array, _ = loader[key]
+            return array
+        elif filetype == "sound":
+            # e.g. {"input": [{"feat": "some/path.wav",
+            #                  "filetype": "sound"},
+            # Assume PCM16
+            import soundfile  # imported lazily; only this branch needs it
+            if not self.keep_all_data_on_mem:
+                array, _ = soundfile.read(filepath, dtype="int16")
+                return array
+            if filepath not in self._loaders:
+                array, _ = soundfile.read(filepath, dtype="int16")
+                self._loaders[filepath] = array
+            return self._loaders[filepath]
+        elif filetype == "npz":
+            # e.g.
+            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
+            #                "filetype": "npz",
+            filepath, key = filepath.split(":", 1)
+
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = np.load(filepath)
+                self._loaders[filepath] = loader
+            return loader[key]
+        elif filetype == "npy":
+            # e.g.
+            #    {"input": [{"feat": "some/path.npy",
+            #                "filetype": "npy"},
+            if not self.keep_all_data_on_mem:
+                return np.load(filepath)
+            if filepath not in self._loaders:
+                self._loaders[filepath] = np.load(filepath)
+            return self._loaders[filepath]
+        elif filetype in ["mat", "vec"]:
+            # e.g. {"input": [{"feat": "some/path.ark:123",
+            #                  "filetype": "mat"}]},
+            # In this case, "123" indicates the starting points of the matrix
+            # load_mat can load both matrix and vector
+            import kaldiio  # imported lazily; only the Kaldi branches need it
+            if not self.keep_all_data_on_mem:
+                return kaldiio.load_mat(filepath)
+            if filepath not in self._loaders:
+                self._loaders[filepath] = kaldiio.load_mat(filepath)
+            return self._loaders[filepath]
+        elif filetype == "scp":
+            # e.g. {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
+            #                  "filetype": "scp",
+            import kaldiio  # imported lazily; only the Kaldi branches need it
+            filepath, key = filepath.split(":", 1)
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = kaldiio.load_scp(filepath)
+                self._loaders[filepath] = loader
+            return loader[key]
+        else:
+            raise NotImplementedError(
+                "Not supported: loader_type={}".format(filetype))

From 5187a93dc1389ff5faf6ba617a34a8ad88defbb7 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 16 Aug 2021 11:45:10 +0000
Subject: [PATCH 05/17] remove fixed hack api

---
 deepspeech/__init__.py     | 49 --------------------------------------
 deepspeech/modules/loss.py |  3 ++-
 2 files changed, 2 insertions(+), 50 deletions(-)

diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index 37531657e..1316256e4 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -30,24 +30,13 @@ logger = Log(__name__).getlog()
 logger.warn = logger.warning
 
 ########### hcak paddle #############
-paddle.bool = 'bool'
-paddle.float16 = 'float16'
 paddle.half = 'float16'
-paddle.float32 = 'float32'
 paddle.float = 'float32'
-paddle.float64 = 'float64'
 paddle.double = 'float64'
-paddle.int8 = 'int8'
-paddle.int16 = 'int16'
 paddle.short = 'int16'
-paddle.int32 = 'int32'
 paddle.int = 'int32'
-paddle.int64 = 'int64'
 paddle.long = 'int64'
-paddle.uint8 = 'uint8'
 paddle.uint16 = 'uint16'
-paddle.complex64 = 'complex64'
-paddle.complex128 = 'complex128'
 paddle.cdouble = 'complex128'
 
 
@@ -403,45 +392,7 @@ if not hasattr(paddle.nn.functional, 'glu'):
 #         return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
 
 
-# hack loss
-def ctc_loss(logits,
-             labels,
-             input_lengths,
-             label_lengths,
-             blank=0,
-             reduction='mean',
-             norm_by_times=True):
-    #logger.info("my ctc loss with norm by times")
-    ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
-    loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
-                                           input_lengths, label_lengths)
-
-    loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
-    assert reduction in ['mean', 'sum', 'none']
-    if reduction == 'mean':
-        loss_out = paddle.mean(loss_out / label_lengths)
-    elif reduction == 'sum':
-        loss_out = paddle.sum(loss_out)
-    return loss_out
-
-
-logger.warn(
-    "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
-)
-F.ctc_loss = ctc_loss
-
 ########### hcak paddle.nn #############
-if not hasattr(paddle.nn, 'Module'):
-    logger.warn("register user Module to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'Module', paddle.nn.Layer)
-
-# maybe cause assert isinstance(sublayer, core.Layer)
-if not hasattr(paddle.nn, 'ModuleList'):
-    logger.warn(
-        "register user ModuleList to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
-
-
 class GLU(nn.Layer):
     """Gated Linear Units (GLU) Layer"""
 
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index 3e441bbbc..8918ca669 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -48,7 +48,8 @@ class CTCLoss(nn.Layer):
         logits = logits.transpose([1, 0, 2])
         # (TODO:Hui Zhang) ctc loss does not support int64 labels
         ys_pad = ys_pad.astype(paddle.int32)
-        loss = self.loss(logits, ys_pad, hlens, ys_lens)
+        loss = self.loss(
+            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
         if self.batch_average:
             # Batch-size average
             loss = loss / B

From c4c4110f256b06ffb4e9ebefd385767685d3a5ae Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 06:25:51 +0000
Subject: [PATCH 06/17] fix io; add test

---
 .bashrc                           |   10 +
 .notebook/espnet_dataloader.ipynb | 1157 +++++++++++++++++++++++++++++
 deepspeech/io/batchfy.py          |   10 +-
 deepspeech/io/dataset.py          |    2 +-
 4 files changed, 1173 insertions(+), 6 deletions(-)
 create mode 100755 .bashrc
 create mode 100644 .notebook/espnet_dataloader.ipynb

diff --git a/.bashrc b/.bashrc
new file mode 100755
index 000000000..15131969a
--- /dev/null
+++ b/.bashrc
@@ -0,0 +1,10 @@
+# Locales: use en_US.UTF-8 everywhere (LC_ALL overrides the other LC_* settings)
+
+export LC_ALL=en_US.UTF-8
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US.UTF-8
+
+# Aliases (note: "rsync" is redefined to always pass --progress -raz)
+alias nvs="nvidia-smi"
+alias rsync="rsync --progress -raz"
+alias his="history"
diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb
new file mode 100644
index 000000000..5d1829794
--- /dev/null
+++ b/.notebook/espnet_dataloader.ipynb
@@ -0,0 +1,1157 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "extensive-venice",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'/workspace/DeepSpeech-2.x'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%cd ..\n",
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "correct-window",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "manifest.dev\t  manifest.test-clean\t   manifest.train\r\n",
+      "manifest.dev.raw  manifest.test-clean.raw  manifest.train.raw\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls /workspace/DeepSpeech-2.x/examples/librispeech/s2/data/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "exceptional-cheese",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dev_data='/workspace/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "extraordinary-orleans",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "register user softmax to paddle, remove this when fixed!\n",
+      "register user log_softmax to paddle, remove this when fixed!\n",
+      "register user sigmoid to paddle, remove this when fixed!\n",
+      "register user log_sigmoid to paddle, remove this when fixed!\n",
+      "register user relu to paddle, remove this when fixed!\n",
+      "override cat of paddle if exists or register, remove this when fixed!\n",
+      "override long of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "override eq of paddle if exists or register, remove this when fixed!\n",
+      "override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
+      "register user view to paddle.Tensor, remove this when fixed!\n",
+      "register user view_as to paddle.Tensor, remove this when fixed!\n",
+      "register user masked_fill to paddle.Tensor, remove this when fixed!\n",
+      "register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
+      "register user fill_ to paddle.Tensor, remove this when fixed!\n",
+      "register user repeat to paddle.Tensor, remove this when fixed!\n",
+      "register user softmax to paddle.Tensor, remove this when fixed!\n",
+      "register user sigmoid to paddle.Tensor, remove this when fixed!\n",
+      "register user relu to paddle.Tensor, remove this when fixed!\n",
+      "register user type_as to paddle.Tensor, remove this when fixed!\n",
+      "register user to to paddle.Tensor, remove this when fixed!\n",
+      "register user float to paddle.Tensor, remove this when fixed!\n",
+      "register user int to paddle.Tensor, remove this when fixed!\n",
+      "register user GLU to paddle.nn, remove this when fixed!\n",
+      "register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
+      "register user export to paddle.jit, remove this when fixed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "from deepspeech.frontend.utility import read_manifest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "returning-lighter",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      "  and should_run_async(code)\n"
+     ]
+    }
+   ],
+   "source": [
+    "dev_json = read_manifest(dev_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "western-founder",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'input': [{'feat': '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.1.ark:16',\n",
+      "            'name': 'input1',\n",
+      "            'shape': [1063, 83]}],\n",
+      " 'output': [{'name': 'target1',\n",
+      "             'shape': [41, 5002],\n",
+      "             'text': 'AS I APPROACHED THE CITY I HEARD BELLS RINGING AND A '\n",
+      "                     'LITTLE LATER I FOUND THE STREETS ASTIR WITH THRONGS OF '\n",
+      "                     'WELL DRESSED PEOPLE IN FAMILY GROUPS WENDING THEIR WAY '\n",
+      "                     'HITHER AND THITHER',\n",
+      "             'token': '▁AS ▁I ▁APPROACHED ▁THE ▁CITY ▁I ▁HEARD ▁BELL S ▁RING '\n",
+      "                      'ING ▁AND ▁A ▁LITTLE ▁LATER ▁I ▁FOUND ▁THE ▁STREETS ▁AS '\n",
+      "                      'T IR ▁WITH ▁THRONG S ▁OF ▁WELL ▁DRESSED ▁PEOPLE ▁IN '\n",
+      "                      '▁FAMILY ▁GROUP S ▁WE ND ING ▁THEIR ▁WAY ▁HITHER ▁AND '\n",
+      "                      '▁THITHER',\n",
+      "             'tokenid': '713 2458 676 4502 1155 2458 2351 849 389 3831 206 627 '\n",
+      "                        '482 2812 2728 2458 2104 4502 4316 713 404 212 4925 '\n",
+      "                        '4549 389 3204 4861 1677 3339 2495 1950 2279 389 4845 '\n",
+      "                        '302 206 4504 4843 2394 627 4526'}],\n",
+      " 'utt': '116-288045-0000',\n",
+      " 'utt2spk': '116-288045'}\n",
+      "5542\n",
+      "<class 'list'>\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pprint import pprint\n",
+    "pprint(dev_json[0])\n",
+    "print(len(dev_json))\n",
+    "print(type(dev_json))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "motivated-receptor",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "import itertools\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "from deepspeech.utils.log import Log\n",
+    "\n",
+    "__all__ = [\"make_batchset\"]\n",
+    "\n",
+    "logger = Log(__name__).getlog()\n",
+    "\n",
+    "\n",
+    "def batchfy_by_seq(\n",
+    "        sorted_data,\n",
+    "        batch_size,\n",
+    "        max_length_in,\n",
+    "        max_length_out,\n",
+    "        min_batch_size=1,\n",
+    "        shortest_first=False,\n",
+    "        ikey=\"input\",\n",
+    "        iaxis=0,\n",
+    "        okey=\"output\",\n",
+    "        oaxis=0, ):\n",
+    "    \"\"\"Make batch set from json dictionary\n",
+    "\n",
+    "    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n",
+    "    :param int batch_size: batch size\n",
+    "    :param int max_length_in: maximum length of input to decide adaptive batch size\n",
+    "    :param int max_length_out: maximum length of output to decide adaptive batch size\n",
+    "    :param int min_batch_size: mininum batch size (for multi-gpu)\n",
+    "    :param bool shortest_first: Sort from batch with shortest samples\n",
+    "        to longest if true, otherwise reverse\n",
+    "    :param str ikey: key to access input\n",
+    "        (for ASR ikey=\"input\", for TTS, MT ikey=\"output\".)\n",
+    "    :param int iaxis: dimension to access input\n",
+    "        (for ASR, TTS iaxis=0, for MT iaxis=\"1\".)\n",
+    "    :param str okey: key to access output\n",
+    "        (for ASR, MT okey=\"output\". for TTS okey=\"input\".)\n",
+    "    :param int oaxis: dimension to access output\n",
+    "        (for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)\n",
+    "    :return: List[List[Tuple[str, dict]]] list of batches\n",
+    "    \"\"\"\n",
+    "    if batch_size <= 0:\n",
+    "        raise ValueError(f\"Invalid batch_size={batch_size}\")\n",
+    "\n",
+    "    # check #utts is more than min_batch_size\n",
+    "    if len(sorted_data) < min_batch_size:\n",
+    "        raise ValueError(\n",
+    "            f\"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size}).\"\n",
+    "        )\n",
+    "\n",
+    "    # make list of minibatches\n",
+    "    minibatches = []\n",
+    "    start = 0\n",
+    "    while True:\n",
+    "        _, info = sorted_data[start]\n",
+    "        ilen = int(info[ikey][iaxis][\"shape\"][0])\n",
+    "        olen = (int(info[okey][oaxis][\"shape\"][0]) if oaxis >= 0 else\n",
+    "                max(map(lambda x: int(x[\"shape\"][0]), info[okey])))\n",
+    "        factor = max(int(ilen / max_length_in), int(olen / max_length_out))\n",
+    "        # change batchsize depending on the input and output length\n",
+    "        # if ilen = 1000 and max_length_in = 800\n",
+    "        # then b = batchsize / 2\n",
+    "        # and max(min_batches, .) avoids batchsize = 0\n",
+    "        bs = max(min_batch_size, int(batch_size / (1 + factor)))\n",
+    "        end = min(len(sorted_data), start + bs)\n",
+    "        minibatch = sorted_data[start:end]\n",
+    "        if shortest_first:\n",
+    "            minibatch.reverse()\n",
+    "\n",
+    "        # check each batch is more than minimum batchsize\n",
+    "        if len(minibatch) < min_batch_size:\n",
+    "            mod = min_batch_size - len(minibatch) % min_batch_size\n",
+    "            additional_minibatch = [\n",
+    "                sorted_data[i] for i in np.random.randint(0, start, mod)\n",
+    "            ]\n",
+    "            if shortest_first:\n",
+    "                additional_minibatch.reverse()\n",
+    "            minibatch.extend(additional_minibatch)\n",
+    "        minibatches.append(minibatch)\n",
+    "\n",
+    "        if end == len(sorted_data):\n",
+    "            break\n",
+    "        start = end\n",
+    "\n",
+    "    # batch: List[List[Tuple[str, dict]]]\n",
+    "    return minibatches\n",
+    "\n",
+    "\n",
+    "def batchfy_by_bin(\n",
+    "        sorted_data,\n",
+    "        batch_bins,\n",
+    "        num_batches=0,\n",
+    "        min_batch_size=1,\n",
+    "        shortest_first=False,\n",
+    "        ikey=\"input\",\n",
+    "        okey=\"output\", ):\n",
+    "    \"\"\"Make variably sized batch set, which maximizes\n",
+    "\n",
+    "    the number of bins up to `batch_bins`.\n",
+    "\n",
+    "    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n",
+    "    :param int batch_bins: Maximum frames of a batch\n",
+    "    :param int num_batches: # number of batches to use (for debug)\n",
+    "    :param int min_batch_size: minimum batch size (for multi-gpu)\n",
+    "    :param int test: Return only every `test` batches\n",
+    "    :param bool shortest_first: Sort from batch with shortest samples\n",
+    "        to longest if true, otherwise reverse\n",
+    "\n",
+    "    :param str ikey: key to access input (for ASR ikey=\"input\", for TTS ikey=\"output\".)\n",
+    "    :param str okey: key to access output (for ASR okey=\"output\". for TTS okey=\"input\".)\n",
+    "\n",
+    "    :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches\n",
+    "    \"\"\"\n",
+    "    if batch_bins <= 0:\n",
+    "        raise ValueError(f\"invalid batch_bins={batch_bins}\")\n",
+    "    length = len(sorted_data)\n",
+    "    idim = int(sorted_data[0][1][ikey][0][\"shape\"][1])\n",
+    "    odim = int(sorted_data[0][1][okey][0][\"shape\"][1])\n",
+    "    logger.info(\"# utts: \" + str(len(sorted_data)))\n",
+    "    minibatches = []\n",
+    "    start = 0\n",
+    "    n = 0\n",
+    "    while True:\n",
+    "        # Dynamic batch size depending on size of samples\n",
+    "        b = 0\n",
+    "        next_size = 0\n",
+    "        max_olen = 0\n",
+    "        while next_size < batch_bins and (start + b) < length:\n",
+    "            ilen = int(sorted_data[start + b][1][ikey][0][\"shape\"][0]) * idim\n",
+    "            olen = int(sorted_data[start + b][1][okey][0][\"shape\"][0]) * odim\n",
+    "            if olen > max_olen:\n",
+    "                max_olen = olen\n",
+    "            next_size = (max_olen + ilen) * (b + 1)\n",
+    "            if next_size <= batch_bins:\n",
+    "                b += 1\n",
+    "            elif next_size == 0:\n",
+    "                raise ValueError(\n",
+    "                    f\"Can't fit one sample in batch_bins ({batch_bins}): \"\n",
+    "                    f\"Please increase the value\")\n",
+    "        end = min(length, start + max(min_batch_size, b))\n",
+    "        batch = sorted_data[start:end]\n",
+    "        if shortest_first:\n",
+    "            batch.reverse()\n",
+    "        minibatches.append(batch)\n",
+    "        # Check for min_batch_size and fixes the batches if needed\n",
+    "        i = -1\n",
+    "        while len(minibatches[i]) < min_batch_size:\n",
+    "            missing = min_batch_size - len(minibatches[i])\n",
+    "            if -i == len(minibatches):\n",
+    "                minibatches[i + 1].extend(minibatches[i])\n",
+    "                minibatches = minibatches[1:]\n",
+    "                break\n",
+    "            else:\n",
+    "                minibatches[i].extend(minibatches[i - 1][:missing])\n",
+    "                minibatches[i - 1] = minibatches[i - 1][missing:]\n",
+    "                i -= 1\n",
+    "        if end == length:\n",
+    "            break\n",
+    "        start = end\n",
+    "        n += 1\n",
+    "    if num_batches > 0:\n",
+    "        minibatches = minibatches[:num_batches]\n",
+    "    lengths = [len(x) for x in minibatches]\n",
+    "    logger.info(\n",
+    "        str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n",
+    "        + \" to \" + str(max(lengths)) + \" samples \" + \"(avg \" + str(\n",
+    "            int(np.mean(lengths))) + \" samples).\")\n",
+    "    return minibatches\n",
+    "\n",
+    "\n",
+    "def batchfy_by_frame(\n",
+    "        sorted_data,\n",
+    "        max_frames_in,\n",
+    "        max_frames_out,\n",
+    "        max_frames_inout,\n",
+    "        num_batches=0,\n",
+    "        min_batch_size=1,\n",
+    "        shortest_first=False,\n",
+    "        ikey=\"input\",\n",
+    "        okey=\"output\", ):\n",
+    "    \"\"\"Make variably sized batch set, which maximizes the number of frames up to the given `max_frames_*` limits.\n",
+    "\n",
+    "    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n",
+    "    :param int max_frames_in: Maximum input frames of a batch\n",
+    "    :param int max_frames_out: Maximum output frames of a batch\n",
+    "    :param int max_frames_inout: Maximum input+output frames of a batch\n",
+    "    :param int num_batches: # number of batches to use (for debug)\n",
+    "    :param int min_batch_size: minimum batch size (for multi-gpu)\n",
+    "    :param int test: Return only every `test` batches\n",
+    "    :param bool shortest_first: Sort from batch with shortest samples\n",
+    "        to longest if true, otherwise reverse\n",
+    "\n",
+    "    :param str ikey: key to access input (for ASR ikey=\"input\", for TTS ikey=\"output\".)\n",
+    "    :param str okey: key to access output (for ASR okey=\"output\". for TTS okey=\"input\".)\n",
+    "\n",
+    "    :return: List[List[Tuple[str, Dict[str, List[Dict[str, Any]]]]]] list of batches\n",
+    "    \"\"\"\n",
+    "    if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:\n",
+    "        raise ValueError(\n",
+    "            \"At least, one of `--batch-frames-in`, `--batch-frames-out` or \"\n",
+    "            \"`--batch-frames-inout` should be > 0\")\n",
+    "    length = len(sorted_data)\n",
+    "    minibatches = []\n",
+    "    start = 0\n",
+    "    end = 0\n",
+    "    while end != length:\n",
+    "        # Dynamic batch size depending on size of samples\n",
+    "        b = 0\n",
+    "        max_olen = 0\n",
+    "        max_ilen = 0\n",
+    "        while (start + b) < length:\n",
+    "            ilen = int(sorted_data[start + b][1][ikey][0][\"shape\"][0])\n",
+    "            if ilen > max_frames_in and max_frames_in != 0:\n",
+    "                raise ValueError(\n",
+    "                    f\"Can't fit one sample in --batch-frames-in ({max_frames_in}): \"\n",
+    "                    f\"Please increase the value\")\n",
+    "            olen = int(sorted_data[start + b][1][okey][0][\"shape\"][0])\n",
+    "            if olen > max_frames_out and max_frames_out != 0:\n",
+    "                raise ValueError(\n",
+    "                    f\"Can't fit one sample in --batch-frames-out ({max_frames_out}): \"\n",
+    "                    f\"Please increase the value\")\n",
+    "            if ilen + olen > max_frames_inout and max_frames_inout != 0:\n",
+    "                raise ValueError(\n",
+    "                    f\"Can't fit one sample in --batch-frames-out ({max_frames_inout}): \"\n",
+    "                    f\"Please increase the value\")\n",
+    "            max_olen = max(max_olen, olen)\n",
+    "            max_ilen = max(max_ilen, ilen)\n",
+    "            in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0\n",
+    "            out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0\n",
+    "            inout_ok = (max_ilen + max_olen) * (\n",
+    "                b + 1) <= max_frames_inout or max_frames_inout == 0\n",
+    "            if in_ok and out_ok and inout_ok:\n",
+    "                # add more seq in the minibatch\n",
+    "                b += 1\n",
+    "            else:\n",
+    "                # no more seq in the minibatch\n",
+    "                break\n",
+    "        end = min(length, start + b)\n",
+    "        batch = sorted_data[start:end]\n",
+    "        if shortest_first:\n",
+    "            batch.reverse()\n",
+    "        minibatches.append(batch)\n",
+    "        # Check for min_batch_size and fixes the batches if needed\n",
+    "        i = -1\n",
+    "        while len(minibatches[i]) < min_batch_size:\n",
+    "            missing = min_batch_size - len(minibatches[i])\n",
+    "            if -i == len(minibatches):\n",
+    "                minibatches[i + 1].extend(minibatches[i])\n",
+    "                minibatches = minibatches[1:]\n",
+    "                break\n",
+    "            else:\n",
+    "                minibatches[i].extend(minibatches[i - 1][:missing])\n",
+    "                minibatches[i - 1] = minibatches[i - 1][missing:]\n",
+    "                i -= 1\n",
+    "        start = end\n",
+    "    if num_batches > 0:\n",
+    "        minibatches = minibatches[:num_batches]\n",
+    "    lengths = [len(x) for x in minibatches]\n",
+    "    logger.info(\n",
+    "        str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n",
+    "        + \" to \" + str(max(lengths)) + \" samples\" + \"(avg \" + str(\n",
+    "            int(np.mean(lengths))) + \" samples).\")\n",
+    "\n",
+    "    return minibatches\n",
+    "\n",
+    "\n",
+    "def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,\n",
+    "                    shortest_first):\n",
+    "    import random\n",
+    "\n",
+    "    logger.info(\"use shuffled batch.\")\n",
+    "    sorted_data = random.sample(data.items(), len(data.items()))\n",
+    "    logger.info(\"# utts: \" + str(len(sorted_data)))\n",
+    "    # make list of minibatches\n",
+    "    minibatches = []\n",
+    "    start = 0\n",
+    "    while True:\n",
+    "        end = min(len(sorted_data), start + batch_size)\n",
+    "        # check each batch is more than minimum batchsize\n",
+    "        minibatch = sorted_data[start:end]\n",
+    "        if shortest_first:\n",
+    "            minibatch.reverse()\n",
+    "        if len(minibatch) < min_batch_size:\n",
+    "            mod = min_batch_size - len(minibatch) % min_batch_size\n",
+    "            additional_minibatch = [\n",
+    "                sorted_data[i] for i in np.random.randint(0, start, mod)\n",
+    "            ]\n",
+    "            if shortest_first:\n",
+    "                additional_minibatch.reverse()\n",
+    "            minibatch.extend(additional_minibatch)\n",
+    "        minibatches.append(minibatch)\n",
+    "        if end == len(sorted_data):\n",
+    "            break\n",
+    "        start = end\n",
+    "\n",
+    "    # for debugging\n",
+    "    if num_batches > 0:\n",
+    "        minibatches = minibatches[:num_batches]\n",
+    "        logger.info(\"# minibatches: \" + str(len(minibatches)))\n",
+    "    return minibatches\n",
+    "\n",
+    "\n",
+    "BATCH_COUNT_CHOICES = [\"auto\", \"seq\", \"bin\", \"frame\"]\n",
+    "BATCH_SORT_KEY_CHOICES = [\"input\", \"output\", \"shuffle\"]\n",
+    "\n",
+    "\n",
+    "def make_batchset(\n",
+    "        data,\n",
+    "        batch_size=0,\n",
+    "        max_length_in=float(\"inf\"),\n",
+    "        max_length_out=float(\"inf\"),\n",
+    "        num_batches=0,\n",
+    "        min_batch_size=1,\n",
+    "        shortest_first=False,\n",
+    "        batch_sort_key=\"input\",\n",
+    "        count=\"auto\",\n",
+    "        batch_bins=0,\n",
+    "        batch_frames_in=0,\n",
+    "        batch_frames_out=0,\n",
+    "        batch_frames_inout=0,\n",
+    "        iaxis=0,\n",
+    "        oaxis=0, ):\n",
+    "    \"\"\"Make batch set from json dictionary\n",
+    "\n",
+    "    if utts have \"category\" value,\n",
+    "\n",
+    "        >>> data = [{'utt': 'utt1', 'category': 'A', 'input': ...},\n",
+    "        ...         {'utt': 'utt2', 'category': 'B', 'input': ...},\n",
+    "        ...         {'utt': 'utt3', 'category': 'B', 'input': ...},\n",
+    "        ...         {'utt': 'utt4', 'category': 'A', 'input': ...}]\n",
+    "        >>> make_batchset(data, batch_size=2, ...)\n",
+    "        [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]\n",
+    "\n",
+    "    Note that if no utt has a \"category\",\n",
+    "    this behaves the same as batchfy_by_{count}.\n",
+    "\n",
+    "    :param List[Dict[str, Any]] data: dictionary loaded from data.json\n",
+    "    :param int batch_size: maximum number of sequences in a minibatch.\n",
+    "    :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.\n",
+    "    :param int batch_frames_in:  maximum number of input frames in a minibatch.\n",
+    "    :param int batch_frames_out: maximum number of output frames in a minibatch.\n",
+    "    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.\n",
+    "    :param str count: strategy to count maximum size of batch.\n",
+    "        For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES\n",
+    "\n",
+    "    :param int max_length_in: maximum length of input to decide adaptive batch size\n",
+    "    :param int max_length_out: maximum length of output to decide adaptive batch size\n",
+    "    :param int num_batches: # number of batches to use (for debug)\n",
+    "    :param int min_batch_size: minimum batch size (for multi-gpu)\n",
+    "    :param bool shortest_first: Sort from batch with shortest samples\n",
+    "        to longest if true, otherwise reverse\n",
+    "    :param str batch_sort_key: how to sort data before creating minibatches\n",
+    "        [\"input\", \"output\", \"shuffle\"]\n",
+    "    :param bool swap_io: if True, use \"input\" as output and \"output\"\n",
+    "        as input in `data` dict\n",
+    "    :param bool mt: if True, use 0-axis of \"output\" as output and 1-axis of \"output\"\n",
+    "        as input in `data` dict\n",
+    "    :param int iaxis: dimension to access input\n",
+    "        (for ASR, TTS iaxis=0, for MT iaxis=\"1\".)\n",
+    "    :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,\n",
+    "        reserved for future research, -1 means all axis.)\n",
+    "    :return: List[List[Tuple[str, dict]]] list of batches\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # check args\n",
+    "    if count not in BATCH_COUNT_CHOICES:\n",
+    "        raise ValueError(\n",
+    "            f\"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}\")\n",
+    "    if batch_sort_key not in BATCH_SORT_KEY_CHOICES:\n",
+    "        raise ValueError(f\"arg 'batch_sort_key' ({batch_sort_key}) should be \"\n",
+    "                         f\"one of {BATCH_SORT_KEY_CHOICES}\")\n",
+    "\n",
+    "    ikey = \"input\"\n",
+    "    okey = \"output\"\n",
+    "    batch_sort_axis = 0  # index of list \n",
+    "\n",
+    "    if count == \"auto\":\n",
+    "        if batch_size != 0:\n",
+    "            count = \"seq\"\n",
+    "        elif batch_bins != 0:\n",
+    "            count = \"bin\"\n",
+    "        elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:\n",
+    "            count = \"frame\"\n",
+    "        else:\n",
+    "            raise ValueError(\n",
+    "                f\"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}\"\n",
+    "            )\n",
+    "        logger.info(f\"count is auto detected as {count}\")\n",
+    "\n",
+    "    if count != \"seq\" and batch_sort_key == \"shuffle\":\n",
+    "        raise ValueError(\n",
+    "            \"batch_sort_key=shuffle is only available if batch_count=seq\")\n",
+    "\n",
+    "    category2data = {}  # Dict[str, dict]\n",
+    "    for v in data:\n",
+    "        k = v['utt']\n",
+    "        category2data.setdefault(v.get(\"category\"), {})[k] = v\n",
+    "\n",
+    "    batches_list = []  # List[List[List[Tuple[str, dict]]]]\n",
+    "    for d in category2data.values():\n",
+    "        if batch_sort_key == \"shuffle\":\n",
+    "            batches = batchfy_shuffle(d, batch_size, min_batch_size,\n",
+    "                                      num_batches, shortest_first)\n",
+    "            batches_list.append(batches)\n",
+    "            continue\n",
+    "\n",
+    "        # sort by `batch_sort_key` lengths (long to short unless shortest_first)\n",
+    "        sorted_data = sorted(\n",
+    "            d.items(),\n",
+    "            key=lambda data: int(data[1][batch_sort_key][batch_sort_axis][\"shape\"][0]),\n",
+    "            reverse=not shortest_first, )\n",
+    "        logger.info(\"# utts: \" + str(len(sorted_data)))\n",
+    "        \n",
+    "        if count == \"seq\":\n",
+    "            batches = batchfy_by_seq(\n",
+    "                sorted_data,\n",
+    "                batch_size=batch_size,\n",
+    "                max_length_in=max_length_in,\n",
+    "                max_length_out=max_length_out,\n",
+    "                min_batch_size=min_batch_size,\n",
+    "                shortest_first=shortest_first,\n",
+    "                ikey=ikey,\n",
+    "                iaxis=iaxis,\n",
+    "                okey=okey,\n",
+    "                oaxis=oaxis, )\n",
+    "        if count == \"bin\":\n",
+    "            batches = batchfy_by_bin(\n",
+    "                sorted_data,\n",
+    "                batch_bins=batch_bins,\n",
+    "                min_batch_size=min_batch_size,\n",
+    "                shortest_first=shortest_first,\n",
+    "                ikey=ikey,\n",
+    "                okey=okey, )\n",
+    "        if count == \"frame\":\n",
+    "            batches = batchfy_by_frame(\n",
+    "                sorted_data,\n",
+    "                max_frames_in=batch_frames_in,\n",
+    "                max_frames_out=batch_frames_out,\n",
+    "                max_frames_inout=batch_frames_inout,\n",
+    "                min_batch_size=min_batch_size,\n",
+    "                shortest_first=shortest_first,\n",
+    "                ikey=ikey,\n",
+    "                okey=okey, )\n",
+    "        batches_list.append(batches)\n",
+    "\n",
+    "    if len(batches_list) == 1:\n",
+    "        batches = batches_list[0]\n",
+    "    else:\n",
+    "        # Concat list. This way is faster than \"sum(batch_list, [])\"\n",
+    "        batches = list(itertools.chain(*batches_list))\n",
+    "\n",
+    "    # for debugging\n",
+    "    if num_batches > 0:\n",
+    "        batches = batches[:num_batches]\n",
+    "    logger.info(\"# minibatches: \" + str(len(batches)))\n",
+    "\n",
+    "    # batch: List[List[Tuple[str, dict]]]\n",
+    "    return batches\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "acquired-hurricane",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:284] use shuffled batch.\n",
+      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:286] # utts: 5542\n",
+      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:467] # minibatches: 555\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "555\n"
+     ]
+    }
+   ],
+   "source": [
+    "batch_size=10\n",
+    "maxlen_in=300\n",
+    "maxlen_out=400\n",
+    "minibatches=0  # for debug\n",
+    "min_batch_size=2\n",
+    "use_sortagrad=True\n",
+    "batch_count='seq'\n",
+    "batch_bins=0\n",
+    "batch_frames_in=3000\n",
+    "batch_frames_out=0\n",
+    "batch_frames_inout=0\n",
+    "            \n",
+    "dev_data = make_batchset(\n",
+    "            dev_json,\n",
+    "            batch_size,\n",
+    "            maxlen_in,\n",
+    "            maxlen_out,\n",
+    "            minibatches,  # for debug\n",
+    "            min_batch_size=min_batch_size,\n",
+    "            shortest_first=use_sortagrad,\n",
+    "            batch_sort_key=\"shuffle\",\n",
+    "            count=batch_count,\n",
+    "            batch_bins=batch_bins,\n",
+    "            batch_frames_in=batch_frames_in,\n",
+    "            batch_frames_out=batch_frames_out,\n",
+    "            batch_frames_inout=batch_frames_inout,\n",
+    "            iaxis=0,\n",
+    "            oaxis=0, )\n",
+    "print(len(dev_data))\n",
+    "# for i in range(len(dev_data)):\n",
+    "#     print(len(dev_data[i]))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "warming-malpractice",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting kaldiio\n",
+      "  Downloading kaldiio-2.17.2.tar.gz (24 kB)\n",
+      "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages (from kaldiio) (1.20.1)\n",
+      "Building wheels for collected packages: kaldiio\n",
+      "  Building wheel for kaldiio (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24469 sha256=aadc8b1a8de5c9769af065ae724fb11326691d2350145019f6e3dba69f020134\n",
+      "  Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n",
+      "Successfully built kaldiio\n",
+      "Installing collected packages: kaldiio\n",
+      "Successfully installed kaldiio-2.17.2\n",
+      "\u001b[33mWARNING: You are using pip version 20.0.1; however, version 21.2.4 is available.\n",
+      "You should consider upgrading via the '/workspace/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install kaldiio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "equipped-subject",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "superb-methodology",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import OrderedDict\n",
+    "import kaldiio\n",
+    "\n",
+    "class LoadInputsAndTargets():\n",
+    "    \"\"\"Create a mini-batch from a list of dicts\n",
+    "\n",
+    "    >>> batch = [('utt1',\n",
+    "    ...           dict(input=[dict(feat='some.ark:123',\n",
+    "    ...                            filetype='mat',\n",
+    "    ...                            name='input1',\n",
+    "    ...                            shape=[100, 80])],\n",
+    "    ...                output=[dict(tokenid='1 2 3 4',\n",
+    "    ...                             name='target1',\n",
+    "    ...                             shape=[4, 31])]))]\n",
+    "    >>> l = LoadInputsAndTargets()\n",
+    "    >>> feat, target = l(batch)\n",
+    "\n",
+    "    :param: str mode: Specify the task mode; only \"asr\" is accepted here\n",
+    "    :param: str preprocess_conf: The path of a json file for pre-processing\n",
+    "    :param: bool load_input: If False, not to load the input data\n",
+    "    :param: bool load_output: If False, not to load the output data\n",
+    "    :param: bool sort_in_input_length: Sort the mini-batch in descending order\n",
+    "        of the input length\n",
+    "    :param: bool use_speaker_embedding: Used for tts mode only\n",
+    "    :param: bool use_second_target: Used for tts mode only\n",
+    "    :param: dict preprocess_args: Set some optional arguments for preprocessing\n",
+    "    :param: Optional[dict] preprocess_args: Used for tts mode only\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "            self,\n",
+    "            mode=\"asr\",\n",
+    "            preprocess_conf=None,\n",
+    "            load_input=True,\n",
+    "            load_output=True,\n",
+    "            sort_in_input_length=True,\n",
+    "            preprocess_args=None,\n",
+    "            keep_all_data_on_mem=False, ):\n",
+    "        self._loaders = {}\n",
+    "\n",
+    "        if mode not in [\"asr\"]:\n",
+    "            raise ValueError(\"Only asr are allowed: mode={}\".format(mode))\n",
+    "\n",
+    "        if preprocess_conf is not None:\n",
+    "            self.preprocessing = AugmentationPipeline(preprocess_conf)\n",
+    "            logging.warning(\n",
+    "                \"[Experimental feature] Some preprocessing will be done \"\n",
+    "                \"for the mini-batch creation using {}\".format(\n",
+    "                    self.preprocessing))\n",
+    "        else:\n",
+    "            # If conf doesn't exist, this function doesn't touch anything.\n",
+    "            self.preprocessing = None\n",
+    "\n",
+    "        self.mode = mode\n",
+    "        self.load_output = load_output\n",
+    "        self.load_input = load_input\n",
+    "        self.sort_in_input_length = sort_in_input_length\n",
+    "        if preprocess_args is None:\n",
+    "            self.preprocess_args = {}\n",
+    "        else:\n",
+    "            assert isinstance(preprocess_args, dict), type(preprocess_args)\n",
+    "            self.preprocess_args = dict(preprocess_args)\n",
+    "\n",
+    "        self.keep_all_data_on_mem = keep_all_data_on_mem\n",
+    "\n",
+    "    def __call__(self, batch, return_uttid=False):\n",
+    "        \"\"\"Function to load inputs and targets from list of dicts\n",
+    "\n",
+    "        :param List[Tuple[str, dict]] batch: list of dict which is subset of\n",
+    "            loaded data.json\n",
+    "        :param bool return_uttid: return utterance ID information for visualization\n",
+    "        :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]\n",
+    "        :return: list of input feature sequences\n",
+    "            [(T_1, D), (T_2, D), ..., (T_B, D)]\n",
+    "        :rtype: list of float ndarray\n",
+    "        :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]\n",
+    "        :rtype: list of int ndarray\n",
+    "\n",
+    "        \"\"\"\n",
+    "        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]\n",
+    "        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]\n",
+    "        uttid_list = []  # List[str]\n",
+    "\n",
+    "        for uttid, info in batch:\n",
+    "            uttid_list.append(uttid)\n",
+    "\n",
+    "            if self.load_input:\n",
+    "                # Note(kamo): This for-loop is for multiple inputs\n",
+    "                for idx, inp in enumerate(info[\"input\"]):\n",
+    "                    # {\"input\":\n",
+    "                    #  [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
+    "                    #    \"filetype\": \"hdf5\",\n",
+    "                    #    \"name\": \"input1\", ...}], ...}\n",
+    "                    x = self._get_from_loader(\n",
+    "                        filepath=inp[\"feat\"],\n",
+    "                        filetype=inp.get(\"filetype\", \"mat\"))\n",
+    "                    x_feats_dict.setdefault(inp[\"name\"], []).append(x)\n",
+    "\n",
+    "            if self.load_output:\n",
+    "                for idx, inp in enumerate(info[\"output\"]):\n",
+    "                    if \"tokenid\" in inp:\n",
+    "                        # ======= Legacy format for output =======\n",
+    "                        # {\"output\": [{\"tokenid\": \"1 2 3 4\"}])\n",
+    "                        x = np.fromiter(\n",
+    "                            map(int, inp[\"tokenid\"].split()), dtype=np.int64)\n",
+    "                    else:\n",
+    "                        # ======= New format =======\n",
+    "                        # {\"output\":\n",
+    "                        #  [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
+    "                        #    \"filetype\": \"hdf5\",\n",
+    "                        #    \"name\": \"target1\", ...}], ...}\n",
+    "                        x = self._get_from_loader(\n",
+    "                            filepath=inp[\"feat\"],\n",
+    "                            filetype=inp.get(\"filetype\", \"mat\"))\n",
+    "\n",
+    "                    y_feats_dict.setdefault(inp[\"name\"], []).append(x)\n",
+    "\n",
+    "        if self.mode == \"asr\":\n",
+    "            return_batch, uttid_list = self._create_batch_asr(\n",
+    "                x_feats_dict, y_feats_dict, uttid_list)\n",
+    "        else:\n",
+    "            raise NotImplementedError(self.mode)\n",
+    "\n",
+    "        if self.preprocessing is not None:\n",
+    "            # Apply pre-processing all input features\n",
+    "            for x_name in return_batch.keys():\n",
+    "                if x_name.startswith(\"input\"):\n",
+    "                    return_batch[x_name] = self.preprocessing(\n",
+    "                        return_batch[x_name], uttid_list,\n",
+    "                        **self.preprocess_args)\n",
+    "\n",
+    "        if return_uttid:\n",
+    "            return tuple(return_batch.values()), uttid_list\n",
+    "\n",
+    "        # Doesn't return the names now.\n",
+    "        return tuple(return_batch.values())\n",
+    "\n",
+    "    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):\n",
+    "        \"\"\"Create an OrderedDict for the mini-batch\n",
+    "\n",
+    "        :param OrderedDict x_feats_dict:\n",
+    "            e.g. {\"input1\": [ndarray, ndarray, ...],\n",
+    "                  \"input2\": [ndarray, ndarray, ...]}\n",
+    "        :param OrderedDict y_feats_dict:\n",
+    "            e.g. {\"target1\": [ndarray, ndarray, ...],\n",
+    "                  \"target2\": [ndarray, ndarray, ...]}\n",
+    "        :param: List[str] uttid_list:\n",
+    "            Give uttid_list to sort in the same order as the mini-batch\n",
+    "        :return: batch, uttid_list\n",
+    "        :rtype: Tuple[OrderedDict, List[str]]\n",
+    "        \"\"\"\n",
+    "        # handle single-input and multi-input (parallel) asr mode\n",
+    "        xs = list(x_feats_dict.values())\n",
+    "\n",
+    "        if self.load_output:\n",
+    "            ys = list(y_feats_dict.values())\n",
+    "            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))\n",
+    "\n",
+    "            # get index of non-zero length samples\n",
+    "            nonzero_idx = list(\n",
+    "                filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))\n",
+    "            for n in range(1, len(y_feats_dict)):\n",
+    "                nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)\n",
+    "        else:\n",
+    "            # Note(kamo): Be careful not to make nonzero_idx to a generator\n",
+    "            nonzero_idx = list(range(len(xs[0])))\n",
+    "\n",
+    "        if self.sort_in_input_length:\n",
+    "            # sort in input lengths based on the first input\n",
+    "            nonzero_sorted_idx = sorted(\n",
+    "                nonzero_idx, key=lambda i: -len(xs[0][i]))\n",
+    "        else:\n",
+    "            nonzero_sorted_idx = nonzero_idx\n",
+    "\n",
+    "        if len(nonzero_sorted_idx) != len(xs[0]):\n",
+    "            logging.warning(\n",
+    "                \"Target sequences include empty tokenid (batch {} -> {}).\".\n",
+    "                format(len(xs[0]), len(nonzero_sorted_idx)))\n",
+    "\n",
+    "        # remove zero-length samples\n",
+    "        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]\n",
+    "        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]\n",
+    "\n",
+    "        x_names = list(x_feats_dict.keys())\n",
+    "        if self.load_output:\n",
+    "            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]\n",
+    "            y_names = list(y_feats_dict.keys())\n",
+    "\n",
+    "            # Keeping x_name and y_name, e.g. input1, for future extension\n",
+    "            return_batch = OrderedDict([\n",
+    "                * [(x_name, x) for x_name, x in zip(x_names, xs)],\n",
+    "                * [(y_name, y) for y_name, y in zip(y_names, ys)],\n",
+    "            ])\n",
+    "        else:\n",
+    "            return_batch = OrderedDict(\n",
+    "                [(x_name, x) for x_name, x in zip(x_names, xs)])\n",
+    "        return return_batch, uttid_list\n",
+    "\n",
+    "    def _get_from_loader(self, filepath, filetype):\n",
+    "        \"\"\"Return ndarray\n",
+    "\n",
+    "        So that file descriptors are opened only on first use,\n",
+    "        the loaders are cached in self._loaders\n",
+    "\n",
+    "        >>> ndarray = loader._get_from_loader(\n",
+    "        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')\n",
+    "\n",
+    "        :param: str filepath:\n",
+    "        :param: str filetype:\n",
+    "        :return:\n",
+    "        :rtype: np.ndarray\n",
+    "        \"\"\"\n",
+    "        if filetype == \"hdf5\":\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
+    "            #                \"filetype\": \"hdf5\",\n",
+    "            # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n",
+    "            filepath, key = filepath.split(\":\", 1)\n",
+    "\n",
+    "            loader = self._loaders.get(filepath)\n",
+    "            if loader is None:\n",
+    "                # To avoid disk access, create loader only for the first time\n",
+    "                loader = h5py.File(filepath, \"r\")\n",
+    "                self._loaders[filepath] = loader\n",
+    "            return loader[key][()]\n",
+    "        elif filetype == \"sound.hdf5\":\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n",
+    "            #                \"filetype\": \"sound.hdf5\",\n",
+    "            # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n",
+    "            filepath, key = filepath.split(\":\", 1)\n",
+    "\n",
+    "            loader = self._loaders.get(filepath)\n",
+    "            if loader is None:\n",
+    "                # To avoid disk access, create loader only for the first time\n",
+    "                loader = SoundHDF5File(filepath, \"r\", dtype=\"int16\")\n",
+    "                self._loaders[filepath] = loader\n",
+    "            array, rate = loader[key]\n",
+    "            return array\n",
+    "        elif filetype == \"sound\":\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.wav\",\n",
+    "            #                \"filetype\": \"sound\"},\n",
+    "            # Assume PCM16\n",
+    "            if not self.keep_all_data_on_mem:\n",
+    "                array, _ = soundfile.read(filepath, dtype=\"int16\")\n",
+    "                return array\n",
+    "            if filepath not in self._loaders:\n",
+    "                array, _ = soundfile.read(filepath, dtype=\"int16\")\n",
+    "                self._loaders[filepath] = array\n",
+    "            return self._loaders[filepath]\n",
+    "        elif filetype == \"npz\":\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.npz:F01_050C0101_PED_REAL\",\n",
+    "            #                \"filetype\": \"npz\",\n",
+    "            filepath, key = filepath.split(\":\", 1)\n",
+    "\n",
+    "            loader = self._loaders.get(filepath)\n",
+    "            if loader is None:\n",
+    "                # To avoid disk access, create loader only for the first time\n",
+    "                loader = np.load(filepath)\n",
+    "                self._loaders[filepath] = loader\n",
+    "            return loader[key]\n",
+    "        elif filetype == \"npy\":\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.npy\",\n",
+    "            #                \"filetype\": \"npy\"},\n",
+    "            if not self.keep_all_data_on_mem:\n",
+    "                return np.load(filepath)\n",
+    "            if filepath not in self._loaders:\n",
+    "                self._loaders[filepath] = np.load(filepath)\n",
+    "            return self._loaders[filepath]\n",
+    "        elif filetype in [\"mat\", \"vec\"]:\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.ark:123\",\n",
+    "            #                \"filetype\": \"mat\"}]},\n",
+    "            # In this case, \"123\" indicates the starting points of the matrix\n",
+    "            # load_mat can load both matrix and vector\n",
+    "            if not self.keep_all_data_on_mem:\n",
+    "                return kaldiio.load_mat(filepath)\n",
+    "            if filepath not in self._loaders:\n",
+    "                self._loaders[filepath] = kaldiio.load_mat(filepath)\n",
+    "            return self._loaders[filepath]\n",
+    "        elif filetype == \"scp\":\n",
+    "            # e.g.\n",
+    "            #    {\"input\": [{\"feat\": \"some/path.scp:F01_050C0101_PED_REAL\",\n",
+    "            #                \"filetype\": \"scp\",\n",
+    "            filepath, key = filepath.split(\":\", 1)\n",
+    "            loader = self._loaders.get(filepath)\n",
+    "            if loader is None:\n",
+    "                # To avoid disk access, create loader only for the first time\n",
+    "                loader = kaldiio.load_scp(filepath)\n",
+    "                self._loaders[filepath] = loader\n",
+    "            return loader[key]\n",
+    "        else:\n",
+    "            raise NotImplementedError(\n",
+    "                \"Not supported: loader_type={}\".format(filetype))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "monthly-muscle",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preprocess_conf=None\n",
+    "train_mode=True\n",
+    "load = LoadInputsAndTargets(\n",
+    "            mode=\"asr\",\n",
+    "            load_output=True,\n",
+    "            preprocess_conf=preprocess_conf,\n",
+    "            preprocess_args={\"train\":\n",
+    "                             train_mode},  # Switch the mode of preprocessing\n",
+    "        )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "periodic-senegal",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-56-9f483b231463>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, batch, return_uttid)\u001b[0m\n\u001b[1;32m     94\u001b[0m                     x = self._get_from_loader(\n\u001b[1;32m     95\u001b[0m                         \u001b[0mfilepath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"feat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m                         filetype=inp.get(\"filetype\", \"mat\"))\n\u001b[0m\u001b[1;32m     97\u001b[0m                     \u001b[0mx_feats_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m_get_from_loader\u001b[0;34m(self, filepath, filetype)\u001b[0m\n\u001b[1;32m    278\u001b[0m             \u001b[0;31m# load_mat can load both matrix and vector\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    279\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeep_all_data_on_mem\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    281\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mfilepath\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    282\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/matio.py\u001b[0m in \u001b[0;36mload_mat\u001b[0;34m(ark_name, endian, fd_dict)\u001b[0m\n\u001b[1;32m    238\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    239\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m         \u001b[0;32mwith\u001b[0m \u001b[0mopen_like_kaldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mark\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfd\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    241\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/utils.py\u001b[0m in \u001b[0;36mopen_like_kaldi\u001b[0;34m(name, mode)\u001b[0m\n\u001b[1;32m    206\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    207\u001b[0m         \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdefault_encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'"
+     ]
+    }
+   ],
+   "source": [
+    "res = load(dev_data[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "humanitarian-container",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ls: cannot access '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark': No such file or directory\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls /workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "heard-prize",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ls: cannot access '/workspace/espnet/': No such file or directory\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls /workspace/espnet/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "convinced-animation",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py
index d237eb749..36c1ec31d 100644
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
@@ -347,7 +347,7 @@ def make_batchset(
     Note that if any utts doesn't have "category",
     perform as same as batchfy_by_{count}
 
-    :param Dict[str, Dict[str, Any]] data: dictionary loaded from data.json
+    :param List[Dict[str, Any]] data: list of utterance dicts (each with an 'utt' key) loaded from data.json
     :param int batch_size: maximum number of sequences in a minibatch.
     :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
     :param int batch_frames_in:  maximum number of input frames in a minibatch.
@@ -374,7 +374,6 @@ def make_batchset(
         reserved for future research, -1 means all axis.)
     :return: List[List[Tuple[str, dict]]] list of batches
     """
-
     # check args
     if count not in BATCH_COUNT_CHOICES:
         raise ValueError(
@@ -386,7 +385,6 @@ def make_batchset(
     ikey = "input"
     okey = "output"
     batch_sort_axis = 0  # index of list 
-
     if count == "auto":
         if batch_size != 0:
             count = "seq"
@@ -405,7 +403,8 @@ def make_batchset(
             "batch_sort_key=shuffle is only available if batch_count=seq")
 
     category2data = {}  # Dict[str, dict]
-    for k, v in data.items():
+    for v in data:
+        k = v['utt']
         category2data.setdefault(v.get("category"), {})[k] = v
 
     batches_list = []  # List[List[List[Tuple[str, dict]]]]
@@ -422,6 +421,7 @@ def make_batchset(
             key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
             reverse=not shortest_first, )
         logger.info("# utts: " + str(len(sorted_data)))
+        
         if count == "seq":
             batches = batchfy_by_seq(
                 sorted_data,
@@ -466,4 +466,4 @@ def make_batchset(
     logger.info("# minibatches: " + str(len(batches)))
 
     # batch: List[List[Tuple[str, dict]]]
-    return batches
+    return batches
\ No newline at end of file
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index e2db93404..a30666b4e 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -16,7 +16,7 @@ from typing import Optional
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
-from deepspeech.frontend.utility import read_manifest
+
 from deepspeech.utils.log import Log
 
 __all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]

From d43600ed266eff8b28a2e1a4b9ac9f360bb17597 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 07:34:54 +0000
Subject: [PATCH 07/17] espnet loader test

---
 .notebook/espnet_dataloader.ipynb | 296 ++++++++++++++++++++++++------
 requirements.txt                  |   1 +
 2 files changed, 237 insertions(+), 60 deletions(-)

diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb
index 5d1829794..12870a8eb 100644
--- a/.notebook/espnet_dataloader.ipynb
+++ b/.notebook/espnet_dataloader.ipynb
@@ -10,13 +10,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/workspace/DeepSpeech-2.x\n"
+      "/workspace/zhanghui/DeepSpeech-2.x\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "'/workspace/DeepSpeech-2.x'"
+       "'/workspace/zhanghui/DeepSpeech-2.x'"
       ]
      },
      "execution_count": 1,
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "id": "correct-window",
    "metadata": {},
    "outputs": [
@@ -45,22 +45,22 @@
     }
    ],
    "source": [
-    "!ls /workspace/DeepSpeech-2.x/examples/librispeech/s2/data/"
+    "!ls /workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "id": "exceptional-cheese",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dev_data='/workspace/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'"
+    "dev_data='/workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "id": "extraordinary-orleans",
    "metadata": {},
    "outputs": [
@@ -68,6 +68,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n",
       "register user softmax to paddle, remove this when fixed!\n",
       "register user log_softmax to paddle, remove this when fixed!\n",
       "register user sigmoid to paddle, remove this when fixed!\n",
@@ -105,26 +106,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "id": "returning-lighter",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "dev_json = read_manifest(dev_data)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "id": "western-founder",
    "metadata": {},
    "outputs": [
@@ -166,7 +158,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 14,
    "id": "motivated-receptor",
    "metadata": {},
    "outputs": [],
@@ -646,19 +638,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 15,
    "id": "acquired-hurricane",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:284] use shuffled batch.\n",
-      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:286] # utts: 5542\n",
-      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:467] # minibatches: 555\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -703,7 +686,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 16,
    "id": "warming-malpractice",
    "metadata": {},
    "outputs": [
@@ -713,16 +696,16 @@
      "text": [
       "Collecting kaldiio\n",
       "  Downloading kaldiio-2.17.2.tar.gz (24 kB)\n",
-      "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages (from kaldiio) (1.20.1)\n",
+      "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages/numpy-1.21.2-py3.7-linux-x86_64.egg (from kaldiio) (1.21.2)\n",
       "Building wheels for collected packages: kaldiio\n",
       "  Building wheel for kaldiio (setup.py) ... \u001b[?25ldone\n",
-      "\u001b[?25h  Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24469 sha256=aadc8b1a8de5c9769af065ae724fb11326691d2350145019f6e3dba69f020134\n",
+      "\u001b[?25h  Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24468 sha256=cd6e066764dcc8c24a9dfe3f7bd8acda18761a6fbcb024995729da8debdb466e\n",
       "  Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n",
       "Successfully built kaldiio\n",
       "Installing collected packages: kaldiio\n",
       "Successfully installed kaldiio-2.17.2\n",
-      "\u001b[33mWARNING: You are using pip version 20.0.1; however, version 21.2.4 is available.\n",
-      "You should consider upgrading via the '/workspace/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
+      "\u001b[33mWARNING: You are using pip version 20.3.3; however, version 21.2.4 is available.\n",
+      "You should consider upgrading via the '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
      ]
     }
    ],
@@ -740,7 +723,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 19,
    "id": "superb-methodology",
    "metadata": {},
    "outputs": [],
@@ -1046,7 +1029,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 20,
    "id": "monthly-muscle",
    "metadata": {},
    "outputs": [],
@@ -1064,70 +1047,263 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 23,
    "id": "periodic-senegal",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = load(dev_data[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "7f0307eb",
+   "metadata": {},
    "outputs": [
     {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-56-9f483b231463>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, batch, return_uttid)\u001b[0m\n\u001b[1;32m     94\u001b[0m                     x = self._get_from_loader(\n\u001b[1;32m     95\u001b[0m                         \u001b[0mfilepath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"feat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m                         filetype=inp.get(\"filetype\", \"mat\"))\n\u001b[0m\u001b[1;32m     97\u001b[0m                     \u001b[0mx_feats_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m_get_from_loader\u001b[0;34m(self, filepath, filetype)\u001b[0m\n\u001b[1;32m    278\u001b[0m             \u001b[0;31m# load_mat can load both matrix and vector\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    279\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeep_all_data_on_mem\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    281\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mfilepath\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    282\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/matio.py\u001b[0m in \u001b[0;36mload_mat\u001b[0;34m(ark_name, endian, fd_dict)\u001b[0m\n\u001b[1;32m    238\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    239\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m         \u001b[0;32mwith\u001b[0m \u001b[0mopen_like_kaldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mark\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfd\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    241\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/utils.py\u001b[0m in \u001b[0;36mopen_like_kaldi\u001b[0;34m(name, mode)\u001b[0m\n\u001b[1;32m    206\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    207\u001b[0m         \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdefault_encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'tuple'>\n",
+      "2\n",
+      "10\n",
+      "10\n",
+      "(1763, 83) float32\n",
+      "(73,) int64\n"
      ]
     }
    ],
    "source": [
-    "res = load(dev_data[0])"
+    "print(type(res))\n",
+    "print(len(res))\n",
+    "print(len(res[0]))\n",
+    "print(len(res[1]))\n",
+    "print(res[0][0].shape, res[0][0].dtype)\n",
+    "print(res[1][0].shape, res[1][0].dtype)\n",
+    "# Tuple[Tuple[np.ndarray], Tuple[np.ndarray]]\n",
+    "# 2[10, 10]\n",
+    "# feats, labels"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 36,
    "id": "humanitarian-container",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "(inputs, outputs), utts = load(dev_data[0], return_uttid=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "heard-prize",
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "ls: cannot access '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark': No such file or directory\r\n"
+      "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038'] 10\n",
+      "10\n"
      ]
     }
    ],
    "source": [
-    "!ls /workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark"
+    "print(utts, len(utts))\n",
+    "print(len(inputs))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
-   "id": "heard-prize",
+   "execution_count": 83,
+   "id": "convinced-animation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import paddle\n",
+    "from deepspeech.io.utility import pad_list\n",
+    "class CustomConverter():\n",
+    "    \"\"\"Custom batch converter.\n",
+    "\n",
+    "    Args:\n",
+    "        subsampling_factor (int): The subsampling factor.\n",
+    "        dtype (np.dtype): Data type the padded features are cast to.\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, subsampling_factor=1, dtype=np.float32):\n",
+    "        \"\"\"Construct a CustomConverter object.\"\"\"\n",
+    "        self.subsampling_factor = subsampling_factor\n",
+    "        self.ignore_id = -1\n",
+    "        self.dtype = dtype\n",
+    "\n",
+    "    def __call__(self, batch):\n",
+    "        \"\"\"Transform a batch into padded arrays.\n",
+    "\n",
+    "        Args:\n",
+    "            batch (list): The batch to transform.\n",
+    "\n",
+    "        Returns:\n",
+    "            tuple: (utts, xs_pad, ilens, ys_pad, olens)\n",
+    "\n",
+    "        \"\"\"\n",
+    "        # batch should be located in list\n",
+    "        assert len(batch) == 1\n",
+    "        (xs, ys), utts = batch[0]\n",
+    "\n",
+    "        # perform subsampling\n",
+    "        if self.subsampling_factor > 1:\n",
+    "            xs = [x[::self.subsampling_factor, :] for x in xs]\n",
+    "\n",
+    "        # get batch of lengths of input sequences\n",
+    "        ilens = np.array([x.shape[0] for x in xs])\n",
+    "\n",
+    "        # perform padding and convert to tensor\n",
+    "        # currently only support real number\n",
+    "        if xs[0].dtype.kind == \"c\":\n",
+    "            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)\n",
+    "            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)\n",
+    "            # Note(kamo):\n",
+    "            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.\n",
+    "            # Don't create ComplexTensor and give it to E2E here\n",
+    "            # because torch.nn.DataParallel can't handle it.\n",
+    "            xs_pad = {\"real\": xs_pad_real, \"imag\": xs_pad_imag}\n",
+    "        else:\n",
+    "            xs_pad = pad_list(xs, 0).astype(self.dtype)\n",
+    "\n",
+    "        # NOTE: this is for multi-output (e.g., speech translation)\n",
+    "        ys_pad = pad_list(\n",
+    "            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],\n",
+    "            self.ignore_id)\n",
+    "\n",
+    "        olens = np.array([y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])\n",
+    "        return utts, xs_pad, ilens, ys_pad, olens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "1b6508fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "convert = CustomConverter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "25d655c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "utts, xs, ilen, ys, olen = convert([load(dev_data[0], return_uttid=True)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "a28e5141",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "ls: cannot access '/workspace/espnet/': No such file or directory\r\n"
+      "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038']\n",
+      "(10, 1763, 83)\n",
+      "(10,)\n",
+      "[1763 1214 1146  757  751  661  625  512  426  329]\n",
+      "(10, 73)\n",
+      "[[2896  621 4502 2176  404  198 3538  391  278  407  389 3719 4577  846\n",
+      "  4501  482 1004  103  116  178 4222  624 4689  176  459   89  101 3465\n",
+      "  3204 4502 2029 1834 2298  829 3366  278 4705 4925  482 2920 3204 2481\n",
+      "   448  627 1254  404   20  202   36 2047  627 2495 4504  481  479   99\n",
+      "    18 2079 4502 1628  202  226 4512 3267  210  278  483  234  367 4502\n",
+      "  2438 3204 1141]\n",
+      " [ 742 4501 4768 4569  742 4483 2495 4502 3040 3204 4502 3961 3204 3992\n",
+      "  3089 4832 4258  621 2391 4642 3218 4502 3439  235  270  313 2385 2833\n",
+      "   742 4502 3282  332    3  280 4237 3252  830 2387   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2099  278 4904 2302  124 4832 3158  482 2888 2495  482 2450  627 1560\n",
+      "  3158 4729  482 3514 3204 1027 3233 2391 2862  399  389 4962 2495  121\n",
+      "   221    7 2340 1216 1658   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2458 2659 1362    2  404 4975 4995  487 3079 2785 2371 3158  824 2603\n",
+      "  4832 2323  999 2603 4832 4156 4678  627 1784   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2458 2340 1661  101 4723 2138 4502 4690  463  332  251 2345 4534 4502\n",
+      "  2396  444 4501 2287  389 4531 4894 1466  959  389 1658 2584 4502 3681\n",
+      "   279 3204 4502 2228 3204 4502 4690  463  332  251   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2368 1248  208 4832 3158  482 1473 3401  999  482 4159 3838  389  478\n",
+      "  4572  404 3158 3063 1481  113 4499 4501 3204 4643    2  389 4111   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2882 2932 4329 1808 4577 4350 4577  482 1636    2  389 1841 3204 3079\n",
+      "  1091  389 3204 2816 2079 4172 4986 4990   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [4869 2598 2603 1976   96  389  478    3 4031  721 4925 2263 1259 2598\n",
+      "  4508  653 4979 4925 2741  252   72  236   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2458 4447 4505  713  624 3207  206 4577 4502 2404 3837 3458 2812 4936\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [1501 3897 2537  278 2601    2  404 2603  482 2235 3388   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]]\n",
+      "[73 38 33 23 38 27 22 22 14 11]\n",
+      "float32\n",
+      "int64\n",
+      "int64\n",
+      "int64\n"
      ]
     }
    ],
    "source": [
-    "!ls /workspace/espnet/"
+    "print(utts)\n",
+    "print(xs.shape)\n",
+    "print(ilen.shape)\n",
+    "print(ilen)\n",
+    "print(ys.shape)\n",
+    "print(ys)\n",
+    "print(olen)\n",
+    "print(xs.dtype)\n",
+    "print(ilen.dtype)\n",
+    "print(ys.dtype)\n",
+    "print(olen.dtype)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "convinced-animation",
+   "id": "1d981df4",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -1135,7 +1311,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/requirements.txt b/requirements.txt
index baaa9ba9b..692f34994 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ tensorboardX
 textgrid
 typeguard
 yacs
+kaldiio

From 888c5dc2c43c6ca4768b3a6f9053777fe19f3139 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 07:36:02 +0000
Subject: [PATCH 08/17] fix dataloader

---
 deepspeech/io/batchfy.py    |  4 +-
 deepspeech/io/collator.py   | 15 +++----
 deepspeech/io/dataloader.py | 18 ++++----
 deepspeech/io/dataset.py    |  1 -
 deepspeech/io/utility.py    | 90 +++++++++++++++++++++++++++++++++++++
 5 files changed, 108 insertions(+), 20 deletions(-)

diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py
index 36c1ec31d..54c6f0e14 100644
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
@@ -421,7 +421,7 @@ def make_batchset(
             key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
             reverse=not shortest_first, )
         logger.info("# utts: " + str(len(sorted_data)))
-        
+
         if count == "seq":
             batches = batchfy_by_seq(
                 sorted_data,
@@ -466,4 +466,4 @@ def make_batchset(
     logger.info("# minibatches: " + str(len(batches)))
 
     # batch: List[List[Tuple[str, dict]]]
-    return batches
\ No newline at end of file
+    return batches
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index 2ef119666..4900350e2 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -23,7 +23,7 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import IGNORE_ID
-from deepspeech.io.utility import pad_sequence
+from deepspeech.io.utility import pad_list
 from deepspeech.utils.log import Log
 
 __all__ = ["SpeechCollator"]
@@ -286,13 +286,12 @@ class SpeechCollator():
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
 
-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_texts = pad_sequence(
-            texts, padding_value=IGNORE_ID).astype(np.int64)
-        text_lens = np.array(text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, padded_texts, text_lens
+        #[B, T, D]
+        xs_pad = pad_list(audios, 0.0).astype(np.float32)
+        ilens = np.array(audio_lens).astype(np.int64)
+        ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
+        olens = np.array(text_lens).astype(np.int64)
+        return utts, xs_pad, ilens, ys_pad, olens
 
     @property
     def manifest(self):
diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py
index 0c5034caa..2e6b6a027 100644
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy as np
 from paddle.io import DataLoader
 
 from deepspeech.frontend.utility import read_manifest
@@ -30,11 +31,11 @@ class CustomConverter():
 
     Args:
         subsampling_factor (int): The subsampling factor.
-        dtype (paddle.dtype): Data type to convert.
-
+        dtype (np.dtype): Data type to convert.
+        
     """
 
-    def __init__(self, subsampling_factor=1, dtype=paddle.float32):
+    def __init__(self, subsampling_factor=1, dtype=np.float32):
         """Construct a CustomConverter object."""
         self.subsampling_factor = subsampling_factor
         self.ignore_id = -1
@@ -52,7 +53,7 @@ class CustomConverter():
         """
         # batch should be located in list
         assert len(batch) == 1
-        xs, ys = batch[0]
+        (xs, ys), utts = batch[0]
 
         # perform subsampling
         if self.subsampling_factor > 1:
@@ -74,15 +75,14 @@ class CustomConverter():
         else:
             xs_pad = pad_list(xs, 0).astype(self.dtype)
 
-        ilens = paddle.to_tensor(ilens)
-
         # NOTE: this is for multi-output (e.g., speech translation)
         ys_pad = pad_list(
             [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
             self.ignore_id)
 
-        olens = np.array([y.shape[0] for y in ys])
-        return xs_pad, ilens, ys_pad, olens
+        olens = np.array(
+            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
+        return utts, xs_pad, ilens, ys_pad, olens
 
 
 class BatchDataLoader():
@@ -166,7 +166,7 @@ class BatchDataLoader():
         # we used an empty collate function instead which returns list
         self.train_loader = DataLoader(
             dataset=TransformDataset(
-                self.data, lambda data: self.converter([self.load(data)])),
+                self.data, lambda data: self.converter([self.load(data, return_uttid=True)])),
             batch_size=1,
             shuffle=not use_sortagrad if train_mode else False,
             collate_fn=lambda x: x[0],
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index a30666b4e..c5b6e7376 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -16,7 +16,6 @@ from typing import Optional
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
-
 from deepspeech.utils.log import Log
 
 __all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
diff --git a/deepspeech/io/utility.py b/deepspeech/io/utility.py
index 915813f3a..91abdf088 100644
--- a/deepspeech/io/utility.py
+++ b/deepspeech/io/utility.py
@@ -14,7 +14,9 @@
 from collections import OrderedDict
 from typing import List
 
+import kaldiio
 import numpy as np
+import soundfile
 
 from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.utils.log import Log
@@ -383,3 +385,91 @@ class LoadInputsAndTargets():
         else:
             raise NotImplementedError(
                 "Not supported: loader_type={}".format(filetype))
+
+
+class SoundHDF5File():
+    """Collecting sound files to a HDF5 file
+
+    >>> f = SoundHDF5File('a.flac.h5', mode='a')
+    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
+    >>> f['id'] = (array, 16000)
+    >>> array, rate = f['id']
+
+
+    :param: str filepath:
+    :param: str mode:
+    :param: str format: The type used when saving wav. flac, nist, htk, etc.
+    :param: str dtype:
+
+    """
+
+    def __init__(self,
+                 filepath,
+                 mode="r+",
+                 format=None,
+                 dtype="int16",
+                 **kwargs):
+        self.filepath = filepath
+        self.mode = mode
+        self.dtype = dtype
+
+        self.file = h5py.File(filepath, mode, **kwargs)
+        if format is None:
+            # filepath = a.flac.h5 -> format = flac
+            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
+            format = second_ext[1:]
+            if format.upper() not in soundfile.available_formats():
+                # If not found, flac is selected
+                format = "flac"
+
+        # This format affects only saving
+        self.format = format
+
+    def __repr__(self):
+        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
+            self.filepath, self.mode, self.format, self.dtype)
+
+    def create_dataset(self, name, shape=None, data=None, **kwds):
+        f = io.BytesIO()
+        array, rate = data
+        soundfile.write(f, array, rate, format=self.format)
+        self.file.create_dataset(
+            name, shape=shape, data=np.void(f.getvalue()), **kwds)
+
+    def __setitem__(self, name, data):
+        self.create_dataset(name, data=data)
+
+    def __getitem__(self, key):
+        data = self.file[key][()]
+        f = io.BytesIO(data.tobytes())
+        array, rate = soundfile.read(f, dtype=self.dtype)
+        return array, rate
+
+    def keys(self):
+        return self.file.keys()
+
+    def values(self):
+        for k in self.file:
+            yield self[k]
+
+    def items(self):
+        for k in self.file:
+            yield k, self[k]
+
+    def __iter__(self):
+        return iter(self.file)
+
+    def __contains__(self, item):
+        return item in self.file
+
+    def __len__(self):
+        return len(self.file)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def close(self):
+        self.file.close()

From 7e44275da39b4a4cc680821c4b87e63a60e0aee8 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 08:48:45 +0000
Subject: [PATCH 09/17] refactor augmentation interface

---
 deepspeech/frontend/augmentor/augmentation.py | 176 ++++++++++++------
 deepspeech/frontend/augmentor/base.py         |   4 +
 .../frontend/augmentor/impulse_response.py    |   5 +
 .../frontend/augmentor/noise_perturb.py       |   5 +
 .../online_bayesian_normalization.py          |   5 +
 deepspeech/frontend/augmentor/resample.py     |   5 +
 .../frontend/augmentor/shift_perturb.py       |   5 +
 deepspeech/frontend/augmentor/spec_augment.py |   5 +
 .../frontend/augmentor/speed_perturb.py       |   5 +
 .../frontend/augmentor/volume_perturb.py      |   5 +
 deepspeech/io/dataset.py                      |   1 +
 requirements.txt                              |   2 +-
 12 files changed, 160 insertions(+), 63 deletions(-)

diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py
index cc0564daf..a61ca37b8 100644
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@@ -13,18 +13,27 @@
 # limitations under the License.
 """Contains the data augmentation pipeline."""
 import json
+from collections.abc import Sequence
+from inspect import signature
 
 import numpy as np
 
-from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor
-from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor
-from deepspeech.frontend.augmentor.online_bayesian_normalization import \
-    OnlineBayesianNormalizationAugmentor
-from deepspeech.frontend.augmentor.resample import ResampleAugmentor
-from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor
-from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor
-from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor
-from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.log import Log
+
+__all__ = ["AugmentationPipeline"]
+
+logger = Log(__name__).getlog()
+
+import_alias = dict(
+    volume="deepspeech.frontend.augmentor.volume_perturb:VolumePerturbAugmentor",
+    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
+    speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
+    resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
+    bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
+    noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
+    impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
+    specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
 
 
 class AugmentationPipeline():
@@ -78,20 +87,74 @@ class AugmentationPipeline():
     augmentor to take effect. If "prob" is zero, the augmentor does not take
     effect.
 
-    :param augmentation_config: Augmentation configuration in json string.
-    :type augmentation_config: str
-    :param random_seed: Random seed.
-    :type random_seed: int
-    :raises ValueError: If the augmentation json config is in incorrect format".
+    Params:
+        augmentation_config(str): Augmentation configuration in json string.
+        random_seed(int): Random seed.
+        train(bool): Whether the pipeline runs in train mode.
+
+    Raises:
+        ValueError: If the augmentation json config is in an incorrect format.
     """
 
-    def __init__(self, augmentation_config: str, random_seed=0):
+    def __init__(self,
+                 augmentation_config: str,
+                 random_seed: int=0,
+                 train: bool=True):
         self._rng = np.random.RandomState(random_seed)
         self._spec_types = ('specaug')
-        self._augmentors, self._rates = self._parse_pipeline_from(
-            augmentation_config, 'audio')
+        self._train = train
+        self.conf = ({} if augmentation_config is None else
+                     json.loads(augmentation_config))
+        self._augmentors, self._rates = self._parse_pipeline_from('all')
+        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
+            'audio')
         self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
-            augmentation_config, 'feature')
+            'feature')
+
+    def __call__(self, xs, uttid_list=None, **kwargs):
+        """Apply the configured augmentors to one sample or a batch of samples.
+
+        Args:
+            xs: a single sample, or a Sequence of samples (handled as a batch).
+            uttid_list: utterance id(s); a str is repeated for every sample.
+            **kwargs: forwarded to each augmentor whose signature accepts them.
+        Returns:
+            The processed batch, or one sample when `xs` was not a Sequence.
+        """
+        is_batch = isinstance(xs, Sequence)
+        if not is_batch:
+            xs = [xs]
+        if isinstance(uttid_list, str):
+            uttid_list = [uttid_list for _ in range(len(xs))]
+
+        # The parsed config is a list of augmentors; only a dict has "mode".
+        conf = self.conf if isinstance(self.conf, dict) else {}
+        mode = conf.get("mode", "sequential")
+        if mode != "sequential":
+            raise NotImplementedError("Not supporting mode={}".format(mode))
+
+        for idx, (func, rate) in enumerate(zip(self._augmentors, self._rates)):
+            if self._rng.uniform(0., 1.) >= rate:
+                continue
+
+            # Derive only the args which the func has
+            try:
+                param = signature(func).parameters
+            except ValueError:
+                # Some callables, e.g. built-in functions, expose no signature
+                param = {}
+            _kwargs = {k: v for k, v in kwargs.items() if k in param}
+
+            try:
+                if uttid_list is not None and "uttid" in param:
+                    xs = [func(x, u, **_kwargs) for x, u in zip(xs, uttid_list)]
+                else:
+                    xs = [func(x, **_kwargs) for x in xs]
+            except Exception:
+                logger.fatal("Catch a exception from {}th func: {}".format(
+                    idx, func))
+                raise
+        return xs if is_batch else xs[0]
 
     def transform_audio(self, audio_segment):
         """Run the pre-processing pipeline for data augmentation.
@@ -101,7 +164,9 @@ class AugmentationPipeline():
         :param audio_segment: Audio segment to process.
         :type audio_segment: AudioSegmenet|SpeechSegment
         """
-        for augmentor, rate in zip(self._augmentors, self._rates):
+        if not self._train:
+            return
+        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
             if self._rng.uniform(0., 1.) < rate:
                 augmentor.transform_audio(audio_segment)
 
@@ -111,57 +176,44 @@ class AugmentationPipeline():
         Args:
             spec_segment (np.ndarray): audio feature, (D, T).
         """
+        if not self._train:
+            return spec_segment
         for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
             if self._rng.uniform(0., 1.) < rate:
                 spec_segment = augmentor.transform_feature(spec_segment)
         return spec_segment
 
-    def _parse_pipeline_from(self, config_json, aug_type='audio'):
+    def _parse_pipeline_from(self, aug_type='all'):
         """Parse the config json to build a augmentation pipelien."""
-        assert aug_type in ('audio', 'feature'), aug_type
-        try:
-            configs = json.loads(config_json)
-            audio_confs = []
-            feature_confs = []
-            for config in configs:
-                if config["type"] in self._spec_types:
-                    feature_confs.append(config)
-                else:
-                    audio_confs.append(config)
-
-            if aug_type == 'audio':
-                aug_confs = audio_confs
-            elif aug_type == 'feature':
-                aug_confs = feature_confs
-
-            augmentors = [
-                self._get_augmentor(config["type"], config["params"])
-                for config in aug_confs
-            ]
-            rates = [config["prob"] for config in aug_confs]
-
-        except Exception as e:
-            raise ValueError("Failed to parse the augmentation config json: "
-                             "%s" % str(e))
+        assert aug_type in ('audio', 'feature', 'all'), aug_type
+        audio_confs = []
+        feature_confs = []
+        all_confs = []
+        for config in self.conf:
+            all_confs.append(config)
+            if config["type"] in self._spec_types:
+                feature_confs.append(config)
+            else:
+                audio_confs.append(config)
+
+        if aug_type == 'audio':
+            aug_confs = audio_confs
+        elif aug_type == 'feature':
+            aug_confs = feature_confs
+        else:
+            aug_confs = all_confs
+
+        augmentors = [
+            self._get_augmentor(config["type"], config["params"])
+            for config in aug_confs
+        ]
+        rates = [config["prob"] for config in aug_confs]
         return augmentors, rates
 
     def _get_augmentor(self, augmentor_type, params):
         """Return an augmentation model by the type name, and pass in params."""
-        if augmentor_type == "volume":
-            return VolumePerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "shift":
-            return ShiftPerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "speed":
-            return SpeedPerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "resample":
-            return ResampleAugmentor(self._rng, **params)
-        elif augmentor_type == "bayesian_normal":
-            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
-        elif augmentor_type == "noise":
-            return NoisePerturbAugmentor(self._rng, **params)
-        elif augmentor_type == "impulse":
-            return ImpulseResponseAugmentor(self._rng, **params)
-        elif augmentor_type == "specaug":
-            return SpecAugmentor(self._rng, **params)
-        else:
+        class_obj = dynamic_import(augmentor_type, import_alias)
+        try:
+            return class_obj(self._rng, **params)
+        except Exception:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
diff --git a/deepspeech/frontend/augmentor/base.py b/deepspeech/frontend/augmentor/base.py
index e6f5c1e9f..87cb4ef72 100644
--- a/deepspeech/frontend/augmentor/base.py
+++ b/deepspeech/frontend/augmentor/base.py
@@ -28,6 +28,10 @@ class AugmentorBase():
     def __init__(self):
         pass
 
+    @abstractmethod
+    def __call__(self, xs):
+        raise NotImplementedError
+
     @abstractmethod
     def transform_audio(self, audio_segment):
         """Adds various effects to the input audio segment. Such effects
diff --git a/deepspeech/frontend/augmentor/impulse_response.py b/deepspeech/frontend/augmentor/impulse_response.py
index fbd617b42..01421fc65 100644
--- a/deepspeech/frontend/augmentor/impulse_response.py
+++ b/deepspeech/frontend/augmentor/impulse_response.py
@@ -30,6 +30,11 @@ class ImpulseResponseAugmentor(AugmentorBase):
         self._rng = rng
         self._impulse_manifest = read_manifest(impulse_manifest_path)
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Add impulse response effect.
 
diff --git a/deepspeech/frontend/augmentor/noise_perturb.py b/deepspeech/frontend/augmentor/noise_perturb.py
index b3c07f5c1..11f5ed105 100644
--- a/deepspeech/frontend/augmentor/noise_perturb.py
+++ b/deepspeech/frontend/augmentor/noise_perturb.py
@@ -36,6 +36,11 @@ class NoisePerturbAugmentor(AugmentorBase):
         self._rng = rng
         self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Add background noise audio.
 
diff --git a/deepspeech/frontend/augmentor/online_bayesian_normalization.py b/deepspeech/frontend/augmentor/online_bayesian_normalization.py
index 5af3b9b03..dc32a1808 100644
--- a/deepspeech/frontend/augmentor/online_bayesian_normalization.py
+++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py
@@ -44,6 +44,11 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase):
         self._rng = rng
         self._startup_delay = startup_delay
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Normalizes the input audio using the online Bayesian approach.
 
diff --git a/deepspeech/frontend/augmentor/resample.py b/deepspeech/frontend/augmentor/resample.py
index 9afce635d..a862b184e 100644
--- a/deepspeech/frontend/augmentor/resample.py
+++ b/deepspeech/frontend/augmentor/resample.py
@@ -31,6 +31,11 @@ class ResampleAugmentor(AugmentorBase):
         self._new_sample_rate = new_sample_rate
         self._rng = rng
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Resamples the input audio to a target sample rate.
 
diff --git a/deepspeech/frontend/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py
index 9cc3fe2d0..6c78c528e 100644
--- a/deepspeech/frontend/augmentor/shift_perturb.py
+++ b/deepspeech/frontend/augmentor/shift_perturb.py
@@ -31,6 +31,11 @@ class ShiftPerturbAugmentor(AugmentorBase):
         self._max_shift_ms = max_shift_ms
         self._rng = rng
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Shift audio.
 
diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py
index 1c2e09fc7..94d23bf46 100644
--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -157,6 +157,11 @@ class SpecAugmentor(AugmentorBase):
             self._time_mask = (t_0, t_0 + t)
         return xs
 
+    def __call__(self, x, train=True):
+        if not train:
+            return x
+        return self.transform_feature(x)
+
     def transform_feature(self, xs: np.ndarray):
         """
         Args:
diff --git a/deepspeech/frontend/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py
index d0977c131..838c5cc29 100644
--- a/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/deepspeech/frontend/augmentor/speed_perturb.py
@@ -79,6 +79,11 @@ class SpeedPerturbAugmentor(AugmentorBase):
             self._rates = np.linspace(
                 self._min_rate, self._max_rate, self._num_rates, endpoint=True)
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Sample a new speed rate from the given range and
         changes the speed of the given audio clip.
diff --git a/deepspeech/frontend/augmentor/volume_perturb.py b/deepspeech/frontend/augmentor/volume_perturb.py
index 0d76e7a05..ffae1693e 100644
--- a/deepspeech/frontend/augmentor/volume_perturb.py
+++ b/deepspeech/frontend/augmentor/volume_perturb.py
@@ -37,6 +37,11 @@ class VolumePerturbAugmentor(AugmentorBase):
         self._max_gain_dBFS = max_gain_dBFS
         self._rng = rng
 
+    def __call__(self, x, uttid=None, train=True):
+        if train:
+            self.transform_audio(x)
+        return x
+
     def transform_audio(self, audio_segment):
         """Change audio loadness.
 
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index c5b6e7376..e2db93404 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -16,6 +16,7 @@ from typing import Optional
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
+from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log
 
 __all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
diff --git a/requirements.txt b/requirements.txt
index 692f34994..af2600e0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 coverage
 gpustat
+kaldiio
 pre-commit
 pybind11
 resampy==0.2.2
@@ -13,4 +14,3 @@ tensorboardX
 textgrid
 typeguard
 yacs
-kaldiio

From 0d3e648aba8a478656ee10b2a38d5b998cec9776 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 08:52:54 +0000
Subject: [PATCH 10/17] refactor speechnn dir

---
 speechnn/{core => examples}/CMakeLists.txt                        | 0
 speechnn/{core/frontend => speechnn}/CMakeLists.txt               | 0
 speechnn/{core => speechnn}/decoder/CMakeLists.txt                | 0
 .../{core/frontend/audio => speechnn/frontend}/CMakeLists.txt     | 0
 .../frontend/text => speechnn/frontend/audio}/CMakeLists.txt      | 0
 speechnn/{core/model => speechnn/frontend/text}/CMakeLists.txt    | 0
 speechnn/{core/protocol => speechnn/model}/CMakeLists.txt         | 0
 speechnn/{core/utils => speechnn/nn}/CMakeLists.txt               | 0
 speechnn/speechnn/protocol/CMakeLists.txt                         | 0
 speechnn/speechnn/utils/CMakeLists.txt                            | 0
 10 files changed, 0 insertions(+), 0 deletions(-)
 rename speechnn/{core => examples}/CMakeLists.txt (100%)
 rename speechnn/{core/frontend => speechnn}/CMakeLists.txt (100%)
 rename speechnn/{core => speechnn}/decoder/CMakeLists.txt (100%)
 rename speechnn/{core/frontend/audio => speechnn/frontend}/CMakeLists.txt (100%)
 rename speechnn/{core/frontend/text => speechnn/frontend/audio}/CMakeLists.txt (100%)
 rename speechnn/{core/model => speechnn/frontend/text}/CMakeLists.txt (100%)
 rename speechnn/{core/protocol => speechnn/model}/CMakeLists.txt (100%)
 rename speechnn/{core/utils => speechnn/nn}/CMakeLists.txt (100%)
 create mode 100644 speechnn/speechnn/protocol/CMakeLists.txt
 create mode 100644 speechnn/speechnn/utils/CMakeLists.txt

diff --git a/speechnn/core/CMakeLists.txt b/speechnn/examples/CMakeLists.txt
similarity index 100%
rename from speechnn/core/CMakeLists.txt
rename to speechnn/examples/CMakeLists.txt
diff --git a/speechnn/core/frontend/CMakeLists.txt b/speechnn/speechnn/CMakeLists.txt
similarity index 100%
rename from speechnn/core/frontend/CMakeLists.txt
rename to speechnn/speechnn/CMakeLists.txt
diff --git a/speechnn/core/decoder/CMakeLists.txt b/speechnn/speechnn/decoder/CMakeLists.txt
similarity index 100%
rename from speechnn/core/decoder/CMakeLists.txt
rename to speechnn/speechnn/decoder/CMakeLists.txt
diff --git a/speechnn/core/frontend/audio/CMakeLists.txt b/speechnn/speechnn/frontend/CMakeLists.txt
similarity index 100%
rename from speechnn/core/frontend/audio/CMakeLists.txt
rename to speechnn/speechnn/frontend/CMakeLists.txt
diff --git a/speechnn/core/frontend/text/CMakeLists.txt b/speechnn/speechnn/frontend/audio/CMakeLists.txt
similarity index 100%
rename from speechnn/core/frontend/text/CMakeLists.txt
rename to speechnn/speechnn/frontend/audio/CMakeLists.txt
diff --git a/speechnn/core/model/CMakeLists.txt b/speechnn/speechnn/frontend/text/CMakeLists.txt
similarity index 100%
rename from speechnn/core/model/CMakeLists.txt
rename to speechnn/speechnn/frontend/text/CMakeLists.txt
diff --git a/speechnn/core/protocol/CMakeLists.txt b/speechnn/speechnn/model/CMakeLists.txt
similarity index 100%
rename from speechnn/core/protocol/CMakeLists.txt
rename to speechnn/speechnn/model/CMakeLists.txt
diff --git a/speechnn/core/utils/CMakeLists.txt b/speechnn/speechnn/nn/CMakeLists.txt
similarity index 100%
rename from speechnn/core/utils/CMakeLists.txt
rename to speechnn/speechnn/nn/CMakeLists.txt
diff --git a/speechnn/speechnn/protocol/CMakeLists.txt b/speechnn/speechnn/protocol/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/speechnn/speechnn/utils/CMakeLists.txt b/speechnn/speechnn/utils/CMakeLists.txt
new file mode 100644
index 000000000..e69de29bb

From 8a2ce655f685e07c35455d39f9d6ee1daa83ed1e Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 09:02:17 +0000
Subject: [PATCH 11/17] refactor io

---
 deepspeech/io/dataloader.py |  66 +-----
 deepspeech/io/dataset.py    |  83 +++++++-
 deepspeech/io/reader.py     | 409 ++++++++++++++++++++++++++++++++++++
 deepspeech/io/utility.py    | 390 +---------------------------------
 4 files changed, 489 insertions(+), 459 deletions(-)
 create mode 100644 deepspeech/io/reader.py

diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py
index 2e6b6a027..b993d9a1a 100644
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
@@ -11,80 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
 from paddle.io import DataLoader
 
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.io.batchfy import make_batchset
+from deepspeech.io.dataset import CustomConverter
 from deepspeech.io.dataset import TransformDataset
-from deepspeech.io.utility import LoadInputsAndTargets
-from deepspeech.io.utility import pad_list
+from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.utils.log import Log
 
-__all__ = ["CustomConverter", "BatchDataLoader"]
+__all__ = ["BatchDataLoader"]
 
 logger = Log(__name__).getlog()
 
 
-class CustomConverter():
-    """Custom batch converter.
-
-    Args:
-        subsampling_factor (int): The subsampling factor.
-        dtype (np.dtype): Data type to convert.
-        
-    """
-
-    def __init__(self, subsampling_factor=1, dtype=np.float32):
-        """Construct a CustomConverter object."""
-        self.subsampling_factor = subsampling_factor
-        self.ignore_id = -1
-        self.dtype = dtype
-
-    def __call__(self, batch):
-        """Transform a batch and send it to a device.
-
-        Args:
-            batch (list): The batch to transform.
-
-        Returns:
-            tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)
-
-        """
-        # batch should be located in list
-        assert len(batch) == 1
-        (xs, ys), utts = batch[0]
-
-        # perform subsampling
-        if self.subsampling_factor > 1:
-            xs = [x[::self.subsampling_factor, :] for x in xs]
-
-        # get batch of lengths of input sequences
-        ilens = np.array([x.shape[0] for x in xs])
-
-        # perform padding and convert to tensor
-        # currently only support real number
-        if xs[0].dtype.kind == "c":
-            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
-            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
-            # Note(kamo):
-            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
-            # Don't create ComplexTensor and give it E2E here
-            # because torch.nn.DataParellel can't handle it.
-            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
-        else:
-            xs_pad = pad_list(xs, 0).astype(self.dtype)
-
-        # NOTE: this is for multi-output (e.g., speech translation)
-        ys_pad = pad_list(
-            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
-            self.ignore_id)
-
-        olens = np.array(
-            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
-        return utts, xs_pad, ilens, ys_pad, olens
-
-
 class BatchDataLoader():
     def __init__(self,
                  json_file: str,
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index e2db93404..a7bf1fc24 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -16,10 +16,15 @@ from typing import Optional
+import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
 from deepspeech.frontend.utility import read_manifest
+from deepspeech.io.utility import pad_list
 from deepspeech.utils.log import Log
 
-__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+__all__ = [
+    "ManifestDataset", "TripletManifestDataset", "TransformDataset",
+    "CustomConverter"
+]
 
 logger = Log(__name__).getlog()
 
@@ -76,12 +81,18 @@ class ManifestDataset(Dataset):
 
         Args:
             manifest_path (str): manifest josn file path
-            max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
-            min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
-            max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
-            min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
-            max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
-            min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
+            max_input_len (float, optional): maximum input seq length,
+                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
+            min_input_len (float, optional): minimum input seq length,
+                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
+            max_output_len (float, optional): maximum output seq length,
+                in modeling units. Defaults to 500.0.
+            min_output_len (float, optional): minimum output seq length,
+                in modeling units. Defaults to 0.0.
+            max_output_input_ratio (float, optional): maximum output seq length/input seq length ratio.
+                Defaults to 10.0.
+            min_output_input_ratio (float, optional): minimum output seq length/input seq length ratio.
+                Defaults to 0.05.
         
         """
         super().__init__()
@@ -118,6 +129,65 @@ class TripletManifestDataset(ManifestDataset):
             "text1"]
 
 
+class CustomConverter():
+    """Custom batch converter.
+
+    Args:
+        subsampling_factor (int): The subsampling factor.
+        dtype (np.dtype): Data type to convert.
+        
+    """
+
+    def __init__(self, subsampling_factor=1, dtype=np.float32):
+        """Construct a CustomConverter object."""
+        self.subsampling_factor = subsampling_factor
+        self.ignore_id = -1
+        self.dtype = dtype
+
+    def __call__(self, batch):
+        """Pad one pre-batched sample into dense arrays with length vectors.
+
+        Args:
+            batch (list): One-element list holding ``((xs, ys), utts)``.
+
+        Returns:
+            tuple: ``(utts, xs_pad, ilens, ys_pad, olens)``
+
+        """
+        # batch should be located in list
+        assert len(batch) == 1
+        (xs, ys), utts = batch[0]
+
+        # perform subsampling
+        if self.subsampling_factor > 1:
+            xs = [x[::self.subsampling_factor, :] for x in xs]
+
+        # get batch of lengths of input sequences
+        ilens = np.array([x.shape[0] for x in xs])
+
+        # pad the inputs with 0 and cast them to self.dtype
+        # complex inputs are split into separate real / imag parts
+        if xs[0].dtype.kind == "c":
+            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
+            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
+            # Note(kamo):
+            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
+            # Don't create ComplexTensor and give it E2E here
+            # because torch.nn.DataParallel can't handle it.
+            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
+        else:
+            xs_pad = pad_list(xs, 0).astype(self.dtype)
+
+        # NOTE: this is for multi-output (e.g., speech translation)
+        ys_pad = pad_list(
+            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
+            self.ignore_id)
+
+        olens = np.array(
+            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
+        return utts, xs_pad, ilens, ys_pad, olens
+
+
 class TransformDataset(Dataset):
     """Transform Dataset.
 
diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py
new file mode 100644
index 000000000..b6dc61b79
--- /dev/null
+++ b/deepspeech/io/reader.py
@@ -0,0 +1,417 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import os
+from collections import OrderedDict
+
+import kaldiio
+import numpy as np
+import soundfile
+
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.utils.log import Log
+
+# h5py is only used by the "hdf5" / "sound.hdf5" loaders, so keep it optional.
+try:
+    import h5py
+except ImportError:
+    h5py = None
+
+__all__ = ["LoadInputsAndTargets"]
+
+logger = Log(__name__).getlog()
+
+
+class LoadInputsAndTargets():
+    """Create a mini-batch from a list of dicts
+
+    >>> batch = [('utt1',
+    ...           dict(input=[dict(feat='some.ark:123',
+    ...                            filetype='mat',
+    ...                            name='input1',
+    ...                            shape=[100, 80])],
+    ...                output=[dict(tokenid='1 2 3 4',
+    ...                             name='target1',
+    ...                             shape=[4, 31])]))]
+    >>> l = LoadInputsAndTargets()
+    >>> feat, target = l(batch)
+
+    :param: str mode: Specify the task mode; only "asr" is supported
+    :param: str preprocess_conf: The path of a json file for pre-processing
+    :param: bool load_input: If False, not to load the input data
+    :param: bool load_output: If False, not to load the output data
+    :param: bool sort_in_input_length: Sort the mini-batch in descending order
+        of the input length
+    :param: dict preprocess_args: Optional keyword arguments forwarded to the
+        pre-processing pipeline on every call
+    :param: bool keep_all_data_on_mem: If True, cache loaded "sound", "npy",
+        "mat" and "vec" data in memory instead of re-reading it on each access
+    """
+
+    def __init__(
+            self,
+            mode="asr",
+            preprocess_conf=None,
+            load_input=True,
+            load_output=True,
+            sort_in_input_length=True,
+            preprocess_args=None,
+            keep_all_data_on_mem=False, ):
+        self._loaders = {}
+
+        if mode not in ["asr"]:
+            raise ValueError("Only asr are allowed: mode={}".format(mode))
+
+        if preprocess_conf is not None:
+            self.preprocessing = AugmentationPipeline(preprocess_conf)
+            logger.warning(
+                "[Experimental feature] Some preprocessing will be done "
+                "for the mini-batch creation using {}".format(
+                    self.preprocessing))
+        else:
+            # Without a conf, loaded features are passed through untouched.
+            self.preprocessing = None
+
+        self.mode = mode
+        self.load_output = load_output
+        self.load_input = load_input
+        self.sort_in_input_length = sort_in_input_length
+        if preprocess_args is None:
+            self.preprocess_args = {}
+        else:
+            assert isinstance(preprocess_args, dict), type(preprocess_args)
+            self.preprocess_args = dict(preprocess_args)
+
+        self.keep_all_data_on_mem = keep_all_data_on_mem
+
+    def __call__(self, batch, return_uttid=False):
+        """Function to load inputs and targets from list of dicts
+
+        :param List[Tuple[str, dict]] batch: list of dict which is subset of
+            loaded data.json
+        :param bool return_uttid: return utterance ID information for visualization
+        :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
+        :return: list of input feature sequences
+            [(T_1, D), (T_2, D), ..., (T_B, D)]
+        :rtype: list of float ndarray
+        :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
+        :rtype: list of int ndarray
+
+        """
+        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
+        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
+        uttid_list = []  # List[str]
+
+        for uttid, info in batch:
+            uttid_list.append(uttid)
+
+            if self.load_input:
+                # Note(kamo): This for-loop is for multiple inputs
+                for idx, inp in enumerate(info["input"]):
+                    # {"input":
+                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+                    #    "filetype": "hdf5",
+                    #    "name": "input1", ...}], ...}
+                    x = self._get_from_loader(
+                        filepath=inp["feat"],
+                        filetype=inp.get("filetype", "mat"))
+                    x_feats_dict.setdefault(inp["name"], []).append(x)
+
+            if self.load_output:
+                for idx, inp in enumerate(info["output"]):
+                    if "tokenid" in inp:
+                        # ======= Legacy format for output =======
+                        # {"output": [{"tokenid": "1 2 3 4"}])
+                        x = np.fromiter(
+                            map(int, inp["tokenid"].split()), dtype=np.int64)
+                    else:
+                        # ======= New format =======
+                        # {"input":
+                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+                        #    "filetype": "hdf5",
+                        #    "name": "target1", ...}], ...}
+                        x = self._get_from_loader(
+                            filepath=inp["feat"],
+                            filetype=inp.get("filetype", "mat"))
+
+                    y_feats_dict.setdefault(inp["name"], []).append(x)
+
+        if self.mode == "asr":
+            return_batch, uttid_list = self._create_batch_asr(
+                x_feats_dict, y_feats_dict, uttid_list)
+        else:
+            raise NotImplementedError(self.mode)
+
+        if self.preprocessing is not None:
+            # Apply pre-processing all input features
+            for x_name in return_batch.keys():
+                if x_name.startswith("input"):
+                    return_batch[x_name] = self.preprocessing(
+                        return_batch[x_name], uttid_list,
+                        **self.preprocess_args)
+
+        if return_uttid:
+            return tuple(return_batch.values()), uttid_list
+
+        # Doesn't return the names now.
+        return tuple(return_batch.values())
+
+    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
+        """Create a OrderedDict for the mini-batch
+
+        :param OrderedDict x_feats_dict:
+            e.g. {"input1": [ndarray, ndarray, ...],
+                  "input2": [ndarray, ndarray, ...]}
+        :param OrderedDict y_feats_dict:
+            e.g. {"target1": [ndarray, ndarray, ...],
+                  "target2": [ndarray, ndarray, ...]}
+        :param: List[str] uttid_list:
+            Give uttid_list to sort in the same order as the mini-batch
+        :return: batch, uttid_list
+        :rtype: Tuple[OrderedDict, List[str]]
+        """
+        # handle single-input and multi-input (parallel) asr mode
+        xs = list(x_feats_dict.values())
+
+        if self.load_output:
+            ys = list(y_feats_dict.values())
+            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))
+
+            # get index of non-zero length samples
+            nonzero_idx = list(
+                filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
+            for n in range(1, len(y_feats_dict)):
+                nonzero_idx = [i for i in nonzero_idx if len(ys[n][i]) > 0]
+        else:
+            # Note(kamo): Be careful not to make nonzero_idx to a generator
+            nonzero_idx = list(range(len(xs[0])))
+
+        if self.sort_in_input_length:
+            # sort in input lengths based on the first input
+            nonzero_sorted_idx = sorted(
+                nonzero_idx, key=lambda i: -len(xs[0][i]))
+        else:
+            nonzero_sorted_idx = nonzero_idx
+
+        if len(nonzero_sorted_idx) != len(xs[0]):
+            logger.warning(
+                "Target sequences include empty tokenid (batch {} -> {}).".
+                format(len(xs[0]), len(nonzero_sorted_idx)))
+
+        # remove zero-length samples
+        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
+        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
+
+        x_names = list(x_feats_dict.keys())
+        if self.load_output:
+            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
+            y_names = list(y_feats_dict.keys())
+
+            # Keeping x_name and y_name, e.g. input1, for future extension
+            return_batch = OrderedDict([
+                * [(x_name, x) for x_name, x in zip(x_names, xs)],
+                * [(y_name, y) for y_name, y in zip(y_names, ys)],
+            ])
+        else:
+            return_batch = OrderedDict(
+                [(x_name, x) for x_name, x in zip(x_names, xs)])
+        return return_batch, uttid_list
+
+    def _get_from_loader(self, filepath, filetype):
+        """Return ndarray
+
+        In order to make the fds to be opened only at the first referring,
+        the loader are stored in self._loaders
+
+        >>> ndarray = loader.get_from_loader(
+        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')
+
+        :param: str filepath:
+        :param: str filetype:
+        :return:
+        :rtype: np.ndarray
+        """
+        if filetype == "hdf5":
+            # e.g.
+            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+            #                "filetype": "hdf5",
+            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
+            filepath, key = filepath.split(":", 1)
+
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = h5py.File(filepath, "r")
+                self._loaders[filepath] = loader
+            return loader[key][()]
+        elif filetype == "sound.hdf5":
+            # e.g.
+            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
+            #                "filetype": "sound.hdf5",
+            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
+            filepath, key = filepath.split(":", 1)
+
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = SoundHDF5File(filepath, "r", dtype="int16")
+                self._loaders[filepath] = loader
+            array, rate = loader[key]
+            return array
+        elif filetype == "sound":
+            # e.g.
+            #    {"input": [{"feat": "some/path.wav",
+            #                "filetype": "sound"},
+            # Assume PCM16
+            if not self.keep_all_data_on_mem:
+                array, _ = soundfile.read(filepath, dtype="int16")
+                return array
+            if filepath not in self._loaders:
+                array, _ = soundfile.read(filepath, dtype="int16")
+                self._loaders[filepath] = array
+            return self._loaders[filepath]
+        elif filetype == "npz":
+            # e.g.
+            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
+            #                "filetype": "npz",
+            filepath, key = filepath.split(":", 1)
+
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = np.load(filepath)
+                self._loaders[filepath] = loader
+            return loader[key]
+        elif filetype == "npy":
+            # e.g.
+            #    {"input": [{"feat": "some/path.npy",
+            #                "filetype": "npy"},
+            if not self.keep_all_data_on_mem:
+                return np.load(filepath)
+            if filepath not in self._loaders:
+                self._loaders[filepath] = np.load(filepath)
+            return self._loaders[filepath]
+        elif filetype in ["mat", "vec"]:
+            # e.g.
+            #    {"input": [{"feat": "some/path.ark:123",
+            #                "filetype": "mat"}]},
+            # In this case, "123" indicates the starting points of the matrix
+            # load_mat can load both matrix and vector
+            if not self.keep_all_data_on_mem:
+                return kaldiio.load_mat(filepath)
+            if filepath not in self._loaders:
+                self._loaders[filepath] = kaldiio.load_mat(filepath)
+            return self._loaders[filepath]
+        elif filetype == "scp":
+            # e.g.
+            #    {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
+            #                "filetype": "scp",
+            filepath, key = filepath.split(":", 1)
+            loader = self._loaders.get(filepath)
+            if loader is None:
+                # To avoid disk access, create loader only for the first time
+                loader = kaldiio.load_scp(filepath)
+                self._loaders[filepath] = loader
+            return loader[key]
+        else:
+            raise NotImplementedError(
+                "Not supported: loader_type={}".format(filetype))
+
+
+class SoundHDF5File():
+    """Collecting sound files to a HDF5 file
+
+    >>> f = SoundHDF5File('a.flac.h5', mode='a')
+    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
+    >>> f['id'] = (array, 16000)
+    >>> array, rate = f['id']
+
+
+    :param: str filepath:
+    :param: str mode:
+    :param: str format: The type used when saving wav. flac, nist, htk, etc.
+    :param: str dtype:
+
+    """
+
+    def __init__(self,
+                 filepath,
+                 mode="r+",
+                 format=None,
+                 dtype="int16",
+                 **kwargs):
+        self.filepath = filepath
+        self.mode = mode
+        self.dtype = dtype
+
+        self.file = h5py.File(filepath, mode, **kwargs)
+        if format is None:
+            # filepath = a.flac.h5 -> format = flac
+            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
+            format = second_ext[1:]
+            if format.upper() not in soundfile.available_formats():
+                # If not found, flac is selected
+                format = "flac"
+
+        # This format affects only saving
+        self.format = format
+
+    def __repr__(self):
+        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
+            self.filepath, self.mode, self.format, self.dtype)
+
+    def create_dataset(self, name, shape=None, data=None, **kwds):
+        f = io.BytesIO()
+        array, rate = data
+        soundfile.write(f, array, rate, format=self.format)
+        self.file.create_dataset(
+            name, shape=shape, data=np.void(f.getvalue()), **kwds)
+
+    def __setitem__(self, name, data):
+        self.create_dataset(name, data=data)
+
+    def __getitem__(self, key):
+        data = self.file[key][()]
+        f = io.BytesIO(data.tobytes())
+        array, rate = soundfile.read(f, dtype=self.dtype)
+        return array, rate
+
+    def keys(self):
+        return self.file.keys()
+
+    def values(self):
+        for k in self.file:
+            yield self[k]
+
+    def items(self):
+        for k in self.file:
+            yield k, self[k]
+
+    def __iter__(self):
+        return iter(self.file)
+
+    def __contains__(self, item):
+        return item in self.file
+
+    def __len__(self):
+        return len(self.file)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def close(self):
+        self.file.close()
diff --git a/deepspeech/io/utility.py b/deepspeech/io/utility.py
index 91abdf088..99487a0af 100644
--- a/deepspeech/io/utility.py
+++ b/deepspeech/io/utility.py
@@ -11,17 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from collections import OrderedDict
 from typing import List
 
-import kaldiio
 import numpy as np
-import soundfile
 
-from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
 from deepspeech.utils.log import Log
 
-__all__ = ["pad_list", "pad_sequence", "LoadInputsAndTargets"]
+__all__ = ["pad_list", "pad_sequence"]
 
 logger = Log(__name__).getlog()
 
@@ -89,387 +85,3 @@ def pad_sequence(sequences: List[np.ndarray],
             out_tensor[:length, i, ...] = tensor
 
     return out_tensor
-
-
-class LoadInputsAndTargets():
-    """Create a mini-batch from a list of dicts
-
-    >>> batch = [('utt1',
-    ...           dict(input=[dict(feat='some.ark:123',
-    ...                            filetype='mat',
-    ...                            name='input1',
-    ...                            shape=[100, 80])],
-    ...                output=[dict(tokenid='1 2 3 4',
-    ...                             name='target1',
-    ...                             shape=[4, 31])]]))
-    >>> l = LoadInputsAndTargets()
-    >>> feat, target = l(batch)
-
-    :param: str mode: Specify the task mode, "asr" or "tts"
-    :param: str preprocess_conf: The path of a json file for pre-processing
-    :param: bool load_input: If False, not to load the input data
-    :param: bool load_output: If False, not to load the output data
-    :param: bool sort_in_input_length: Sort the mini-batch in descending order
-        of the input length
-    :param: bool use_speaker_embedding: Used for tts mode only
-    :param: bool use_second_target: Used for tts mode only
-    :param: dict preprocess_args: Set some optional arguments for preprocessing
-    :param: Optional[dict] preprocess_args: Used for tts mode only
-    """
-
-    def __init__(
-            self,
-            mode="asr",
-            preprocess_conf=None,
-            load_input=True,
-            load_output=True,
-            sort_in_input_length=True,
-            preprocess_args=None,
-            keep_all_data_on_mem=False, ):
-        self._loaders = {}
-
-        if mode not in ["asr"]:
-            raise ValueError("Only asr are allowed: mode={}".format(mode))
-
-        if preprocess_conf is not None:
-            self.preprocessing = AugmentationPipeline(preprocess_conf)
-            logging.warning(
-                "[Experimental feature] Some preprocessing will be done "
-                "for the mini-batch creation using {}".format(
-                    self.preprocessing))
-        else:
-            # If conf doesn't exist, this function don't touch anything.
-            self.preprocessing = None
-
-        self.mode = mode
-        self.load_output = load_output
-        self.load_input = load_input
-        self.sort_in_input_length = sort_in_input_length
-        if preprocess_args is None:
-            self.preprocess_args = {}
-        else:
-            assert isinstance(preprocess_args, dict), type(preprocess_args)
-            self.preprocess_args = dict(preprocess_args)
-
-        self.keep_all_data_on_mem = keep_all_data_on_mem
-
-    def __call__(self, batch, return_uttid=False):
-        """Function to load inputs and targets from list of dicts
-
-        :param List[Tuple[str, dict]] batch: list of dict which is subset of
-            loaded data.json
-        :param bool return_uttid: return utterance ID information for visualization
-        :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]
-        :return: list of input feature sequences
-            [(T_1, D), (T_2, D), ..., (T_B, D)]
-        :rtype: list of float ndarray
-        :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]
-        :rtype: list of int ndarray
-
-        """
-        x_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
-        y_feats_dict = OrderedDict()  # OrderedDict[str, List[np.ndarray]]
-        uttid_list = []  # List[str]
-
-        for uttid, info in batch:
-            uttid_list.append(uttid)
-
-            if self.load_input:
-                # Note(kamo): This for-loop is for multiple inputs
-                for idx, inp in enumerate(info["input"]):
-                    # {"input":
-                    #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
-                    #    "filetype": "hdf5",
-                    #    "name": "input1", ...}], ...}
-                    x = self._get_from_loader(
-                        filepath=inp["feat"],
-                        filetype=inp.get("filetype", "mat"))
-                    x_feats_dict.setdefault(inp["name"], []).append(x)
-
-            if self.load_output:
-                for idx, inp in enumerate(info["output"]):
-                    if "tokenid" in inp:
-                        # ======= Legacy format for output =======
-                        # {"output": [{"tokenid": "1 2 3 4"}])
-                        x = np.fromiter(
-                            map(int, inp["tokenid"].split()), dtype=np.int64)
-                    else:
-                        # ======= New format =======
-                        # {"input":
-                        #  [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
-                        #    "filetype": "hdf5",
-                        #    "name": "target1", ...}], ...}
-                        x = self._get_from_loader(
-                            filepath=inp["feat"],
-                            filetype=inp.get("filetype", "mat"))
-
-                    y_feats_dict.setdefault(inp["name"], []).append(x)
-
-        if self.mode == "asr":
-            return_batch, uttid_list = self._create_batch_asr(
-                x_feats_dict, y_feats_dict, uttid_list)
-        else:
-            raise NotImplementedError(self.mode)
-
-        if self.preprocessing is not None:
-            # Apply pre-processing all input features
-            for x_name in return_batch.keys():
-                if x_name.startswith("input"):
-                    return_batch[x_name] = self.preprocessing(
-                        return_batch[x_name], uttid_list,
-                        **self.preprocess_args)
-
-        if return_uttid:
-            return tuple(return_batch.values()), uttid_list
-
-        # Doesn't return the names now.
-        return tuple(return_batch.values())
-
-    def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):
-        """Create a OrderedDict for the mini-batch
-
-        :param OrderedDict x_feats_dict:
-            e.g. {"input1": [ndarray, ndarray, ...],
-                  "input2": [ndarray, ndarray, ...]}
-        :param OrderedDict y_feats_dict:
-            e.g. {"target1": [ndarray, ndarray, ...],
-                  "target2": [ndarray, ndarray, ...]}
-        :param: List[str] uttid_list:
-            Give uttid_list to sort in the same order as the mini-batch
-        :return: batch, uttid_list
-        :rtype: Tuple[OrderedDict, List[str]]
-        """
-        # handle single-input and multi-input (paralell) asr mode
-        xs = list(x_feats_dict.values())
-
-        if self.load_output:
-            ys = list(y_feats_dict.values())
-            assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))
-
-            # get index of non-zero length samples
-            nonzero_idx = list(
-                filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))
-            for n in range(1, len(y_feats_dict)):
-                nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)
-        else:
-            # Note(kamo): Be careful not to make nonzero_idx to a generator
-            nonzero_idx = list(range(len(xs[0])))
-
-        if self.sort_in_input_length:
-            # sort in input lengths based on the first input
-            nonzero_sorted_idx = sorted(
-                nonzero_idx, key=lambda i: -len(xs[0][i]))
-        else:
-            nonzero_sorted_idx = nonzero_idx
-
-        if len(nonzero_sorted_idx) != len(xs[0]):
-            logging.warning(
-                "Target sequences include empty tokenid (batch {} -> {}).".
-                format(len(xs[0]), len(nonzero_sorted_idx)))
-
-        # remove zero-length samples
-        xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]
-        uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]
-
-        x_names = list(x_feats_dict.keys())
-        if self.load_output:
-            ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]
-            y_names = list(y_feats_dict.keys())
-
-            # Keeping x_name and y_name, e.g. input1, for future extension
-            return_batch = OrderedDict([
-                * [(x_name, x) for x_name, x in zip(x_names, xs)],
-                * [(y_name, y) for y_name, y in zip(y_names, ys)],
-            ])
-        else:
-            return_batch = OrderedDict(
-                [(x_name, x) for x_name, x in zip(x_names, xs)])
-        return return_batch, uttid_list
-
-    def _get_from_loader(self, filepath, filetype):
-        """Return ndarray
-
-        In order to make the fds to be opened only at the first referring,
-        the loader are stored in self._loaders
-
-        >>> ndarray = loader.get_from_loader(
-        ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')
-
-        :param: str filepath:
-        :param: str filetype:
-        :return:
-        :rtype: np.ndarray
-        """
-        if filetype == "hdf5":
-            # e.g.
-            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
-            #                "filetype": "hdf5",
-            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
-            filepath, key = filepath.split(":", 1)
-
-            loader = self._loaders.get(filepath)
-            if loader is None:
-                # To avoid disk access, create loader only for the first time
-                loader = h5py.File(filepath, "r")
-                self._loaders[filepath] = loader
-            return loader[key][()]
-        elif filetype == "sound.hdf5":
-            # e.g.
-            #    {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
-            #                "filetype": "sound.hdf5",
-            # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
-            filepath, key = filepath.split(":", 1)
-
-            loader = self._loaders.get(filepath)
-            if loader is None:
-                # To avoid disk access, create loader only for the first time
-                loader = SoundHDF5File(filepath, "r", dtype="int16")
-                self._loaders[filepath] = loader
-            array, rate = loader[key]
-            return array
-        elif filetype == "sound":
-            # e.g.
-            #    {"input": [{"feat": "some/path.wav",
-            #                "filetype": "sound"},
-            # Assume PCM16
-            if not self.keep_all_data_on_mem:
-                array, _ = soundfile.read(filepath, dtype="int16")
-                return array
-            if filepath not in self._loaders:
-                array, _ = soundfile.read(filepath, dtype="int16")
-                self._loaders[filepath] = array
-            return self._loaders[filepath]
-        elif filetype == "npz":
-            # e.g.
-            #    {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
-            #                "filetype": "npz",
-            filepath, key = filepath.split(":", 1)
-
-            loader = self._loaders.get(filepath)
-            if loader is None:
-                # To avoid disk access, create loader only for the first time
-                loader = np.load(filepath)
-                self._loaders[filepath] = loader
-            return loader[key]
-        elif filetype == "npy":
-            # e.g.
-            #    {"input": [{"feat": "some/path.npy",
-            #                "filetype": "npy"},
-            if not self.keep_all_data_on_mem:
-                return np.load(filepath)
-            if filepath not in self._loaders:
-                self._loaders[filepath] = np.load(filepath)
-            return self._loaders[filepath]
-        elif filetype in ["mat", "vec"]:
-            # e.g.
-            #    {"input": [{"feat": "some/path.ark:123",
-            #                "filetype": "mat"}]},
-            # In this case, "123" indicates the starting points of the matrix
-            # load_mat can load both matrix and vector
-            if not self.keep_all_data_on_mem:
-                return kaldiio.load_mat(filepath)
-            if filepath not in self._loaders:
-                self._loaders[filepath] = kaldiio.load_mat(filepath)
-            return self._loaders[filepath]
-        elif filetype == "scp":
-            # e.g.
-            #    {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
-            #                "filetype": "scp",
-            filepath, key = filepath.split(":", 1)
-            loader = self._loaders.get(filepath)
-            if loader is None:
-                # To avoid disk access, create loader only for the first time
-                loader = kaldiio.load_scp(filepath)
-                self._loaders[filepath] = loader
-            return loader[key]
-        else:
-            raise NotImplementedError(
-                "Not supported: loader_type={}".format(filetype))
-
-
-class SoundHDF5File():
-    """Collecting sound files to a HDF5 file
-
-    >>> f = SoundHDF5File('a.flac.h5', mode='a')
-    >>> array = np.random.randint(0, 100, 100, dtype=np.int16)
-    >>> f['id'] = (array, 16000)
-    >>> array, rate = f['id']
-
-
-    :param: str filepath:
-    :param: str mode:
-    :param: str format: The type used when saving wav. flac, nist, htk, etc.
-    :param: str dtype:
-
-    """
-
-    def __init__(self,
-                 filepath,
-                 mode="r+",
-                 format=None,
-                 dtype="int16",
-                 **kwargs):
-        self.filepath = filepath
-        self.mode = mode
-        self.dtype = dtype
-
-        self.file = h5py.File(filepath, mode, **kwargs)
-        if format is None:
-            # filepath = a.flac.h5 -> format = flac
-            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
-            format = second_ext[1:]
-            if format.upper() not in soundfile.available_formats():
-                # If not found, flac is selected
-                format = "flac"
-
-        # This format affects only saving
-        self.format = format
-
-    def __repr__(self):
-        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
-            self.filepath, self.mode, self.format, self.dtype)
-
-    def create_dataset(self, name, shape=None, data=None, **kwds):
-        f = io.BytesIO()
-        array, rate = data
-        soundfile.write(f, array, rate, format=self.format)
-        self.file.create_dataset(
-            name, shape=shape, data=np.void(f.getvalue()), **kwds)
-
-    def __setitem__(self, name, data):
-        self.create_dataset(name, data=data)
-
-    def __getitem__(self, key):
-        data = self.file[key][()]
-        f = io.BytesIO(data.tobytes())
-        array, rate = soundfile.read(f, dtype=self.dtype)
-        return array, rate
-
-    def keys(self):
-        return self.file.keys()
-
-    def values(self):
-        for k in self.file:
-            yield self[k]
-
-    def items(self):
-        for k in self.file:
-            yield k, self[k]
-
-    def __iter__(self):
-        return iter(self.file)
-
-    def __contains__(self, item):
-        return item in self.file
-
-    def __len__(self, item):
-        return len(self.file)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.file.close()
-
-    def close(self):
-        self.file.close()

From b602382ee1f6607423ba549ee9fba344571fe603 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 09:47:18 +0000
Subject: [PATCH 12/17] update test

---
 .notebook/espnet_dataloader.ipynb | 44 +++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb
index 12870a8eb..7abb138ff 100644
--- a/.notebook/espnet_dataloader.ipynb
+++ b/.notebook/espnet_dataloader.ipynb
@@ -1058,7 +1058,7 @@
   {
    "cell_type": "code",
    "execution_count": 34,
-   "id": "7f0307eb",
+   "id": "502d3f4d",
    "metadata": {},
    "outputs": [
     {
@@ -1186,7 +1186,7 @@
   {
    "cell_type": "code",
    "execution_count": 84,
-   "id": "1b6508fc",
+   "id": "0b92ade5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1196,7 +1196,7 @@
   {
    "cell_type": "code",
    "execution_count": 85,
-   "id": "25d655c0",
+   "id": "8dbd847c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1206,7 +1206,7 @@
   {
    "cell_type": "code",
    "execution_count": 87,
-   "id": "a28e5141",
+   "id": "31c085f4",
    "metadata": {},
    "outputs": [
     {
@@ -1300,10 +1300,44 @@
     "print(olen.dtype)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "72e9ba60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "64593e5f",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'str' object has no attribute 'stat'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[0;32m/tmp/ipykernel_48616/3505477735.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'xxxxxxxx'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mPath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/usr/local/lib/python3.7/pathlib.py\u001b[0m in \u001b[0;36mis_file\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1342\u001b[0m         \"\"\"\n\u001b[1;32m   1343\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1344\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mS_ISREG\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mst_mode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1345\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1346\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mENOENT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mENOTDIR\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'stat'"
+     ]
+    }
+   ],
+   "source": [
+    "s='xxxxxxxx'\n",
+    "Path.is_file(s)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1d981df4",
+   "id": "fcea3fd0",
    "metadata": {},
    "outputs": [],
    "source": []

From 4e4c242b0939b2a2e0da748c8647e1ad5c5ef817 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Tue, 17 Aug 2021 09:49:33 +0000
Subject: [PATCH 13/17] fix bugs

---
 .bashrc                                       | 10 ----------
 .notebook/u2_confermer_model_wenet.ipynb      |  2 +-
 deepspeech/frontend/augmentor/augmentation.py |  5 +----
 deepspeech/io/dataset.py                      |  1 +
 deepspeech/models/ds2/rnn.py                  |  2 +-
 deepspeech/models/u2.py                       |  2 +-
 deepspeech/models/u2_st.py                    |  2 +-
 deepspeech/modules/decoder.py                 |  4 ++--
 deepspeech/modules/decoder_layer.py           | 14 +++++++-------
 deepspeech/modules/encoder.py                 |  4 ++--
 deepspeech/modules/rnn.py                     |  2 +-
 examples/librispeech/s0/conf/deepspeech2.yaml |  2 +-
 12 files changed, 19 insertions(+), 31 deletions(-)
 delete mode 100755 .bashrc

diff --git a/.bashrc b/.bashrc
deleted file mode 100755
index 15131969a..000000000
--- a/.bashrc
+++ /dev/null
@@ -1,10 +0,0 @@
-# Locales
-
-export LC_ALL=en_US.UTF-8
-export LANG=en_US.UTF-8
-export LANGUAGE=en_US.UTF-8
-
-# Aliases
-alias nvs="nvidia-smi"
-alias rsync="rsync --progress -raz"
-alias his="history"
diff --git a/.notebook/u2_confermer_model_wenet.ipynb b/.notebook/u2_confermer_model_wenet.ipynb
index 4f2c9632f..a425e16cb 100644
--- a/.notebook/u2_confermer_model_wenet.ipynb
+++ b/.notebook/u2_confermer_model_wenet.ipynb
@@ -3431,7 +3431,7 @@
     "        convolution_layer_args = (output_size, cnn_module_kernel, activation,\n",
     "                                  cnn_module_norm, causal)\n",
     "\n",
-    "        self.encoders = nn.ModuleList([\n",
+    "        self.encoders = nn.LayerList([\n",
     "            ConformerEncoderLayer(\n",
     "                size=output_size,\n",
     "                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n",
diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py
index a61ca37b8..cfebc463c 100644
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@@ -164,8 +164,6 @@ class AugmentationPipeline():
         :param audio_segment: Audio segment to process.
         :type audio_segment: AudioSegmenet|SpeechSegment
         """
-        if not self._train:
-            return
         for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
             if self._rng.uniform(0., 1.) < rate:
                 augmentor.transform_audio(audio_segment)
@@ -176,8 +174,6 @@ class AugmentationPipeline():
         Args:
             spec_segment (np.ndarray): audio feature, (D, T).
         """
-        if not self._train:
-            return
         for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
             if self._rng.uniform(0., 1.) < rate:
                 spec_segment = augmentor.transform_feature(spec_segment)
@@ -217,3 +213,4 @@ class AugmentationPipeline():
             obj = class_obj(self._rng, **params)
         except Exception:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
+        return obj
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index a7bf1fc24..259b3b490 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from typing import Optional
 
+import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
diff --git a/deepspeech/models/ds2/rnn.py b/deepspeech/models/ds2/rnn.py
index 01b55c4a2..0d8c9fd2c 100644
--- a/deepspeech/models/ds2/rnn.py
+++ b/deepspeech/models/ds2/rnn.py
@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                         share_weights=share_rnn_weights))
             i_size = h_size * 2
 
-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)
 
     def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """
diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index f1d466a27..7ed16c9d2 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"]
 logger = Log(__name__).getlog()
 
 
-class U2BaseModel(nn.Module):
+class U2BaseModel(nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""
 
     @classmethod
diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py
index a73f52e99..99420a89c 100644
--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
@@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"]
 logger = Log(__name__).getlog()
 
 
-class U2STBaseModel(nn.Module):
+class U2STBaseModel(nn.Layer):
     """CTC-Attention hybrid Encoder-Decoder model"""
 
     @classmethod
diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py
index 696a6315b..87c9fa492 100644
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@@ -33,7 +33,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]
 
 
-class TransformerDecoder(nn.Module):
+class TransformerDecoder(nn.Layer):
     """Base class of Transfomer decoder module.
     Args:
         vocab_size: output dim
@@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module):
         self.use_output_layer = use_output_layer
         self.output_layer = nn.Linear(attention_dim, vocab_size)
 
-        self.decoders = nn.ModuleList([
+        self.decoders = nn.LayerList([
             DecoderLayer(
                 size=attention_dim,
                 self_attn=MultiHeadedAttention(attention_heads, attention_dim,
diff --git a/deepspeech/modules/decoder_layer.py b/deepspeech/modules/decoder_layer.py
index c6fac5412..47c42615e 100644
--- a/deepspeech/modules/decoder_layer.py
+++ b/deepspeech/modules/decoder_layer.py
@@ -25,15 +25,15 @@ logger = Log(__name__).getlog()
 __all__ = ["DecoderLayer"]
 
 
-class DecoderLayer(nn.Module):
+class DecoderLayer(nn.Layer):
     """Single decoder layer module.
     Args:
         size (int): Input dimension.
-        self_attn (nn.Module): Self-attention module instance.
+        self_attn (nn.Layer): Self-attention module instance.
             `MultiHeadedAttention` instance can be used as the argument.
-        src_attn (nn.Module): Self-attention module instance.
+        src_attn (nn.Layer): Self-attention module instance.
             `MultiHeadedAttention` instance can be used as the argument.
-        feed_forward (nn.Module): Feed-forward module instance.
+        feed_forward (nn.Layer): Feed-forward module instance.
             `PositionwiseFeedForward` instance can be used as the argument.
         dropout_rate (float): Dropout rate.
         normalize_before (bool):
@@ -48,9 +48,9 @@ class DecoderLayer(nn.Module):
     def __init__(
             self,
             size: int,
-            self_attn: nn.Module,
-            src_attn: nn.Module,
-            feed_forward: nn.Module,
+            self_attn: nn.Layer,
+            src_attn: nn.Layer,
+            feed_forward: nn.Layer,
             dropout_rate: float,
             normalize_before: bool=True,
             concat_after: bool=False, ):
diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py
index 27e0f8d78..71ec61a0e 100644
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder):
                          pos_enc_layer_type, normalize_before, concat_after,
                          static_chunk_size, use_dynamic_chunk, global_cmvn,
                          use_dynamic_left_chunk)
-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
             TransformerEncoderLayer(
                 size=output_size,
                 self_attn=MultiHeadedAttention(attention_heads, output_size,
@@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder):
         convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                   cnn_module_norm, causal)
 
-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
             ConformerEncoderLayer(
                 size=output_size,
                 self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py
index 01b55c4a2..0d8c9fd2c 100644
--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                         share_weights=share_rnn_weights))
             i_size = h_size * 2
 
-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)
 
     def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index acee94c3e..dab8d0462 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -32,7 +32,7 @@ collator:
   keep_transcription_text: False
   sortagrad: True 
   shuffle_method: batch_shuffle
-  num_workers: 0
+  num_workers: 2
 
 model:
   num_conv_layers: 2

From 009b7a0b0b83d6110ce58d5d9adfb4826cdc5574 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 18 Aug 2021 07:11:37 +0000
Subject: [PATCH 14/17] refactor converter

---
 deepspeech/io/__init__.py   | 136 ------------------------------------
 deepspeech/io/converter.py  |  80 +++++++++++++++++++++
 deepspeech/io/dataloader.py |   2 +-
 deepspeech/io/dataset.py    |  66 +----------------
 4 files changed, 82 insertions(+), 202 deletions(-)
 create mode 100644 deepspeech/io/converter.py

diff --git a/deepspeech/io/__init__.py b/deepspeech/io/__init__.py
index e180f18ee..185a92b8d 100644
--- a/deepspeech/io/__init__.py
+++ b/deepspeech/io/__init__.py
@@ -11,139 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
-from paddle.io import DataLoader
-
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.sampler import SortagradBatchSampler
-from deepspeech.io.sampler import SortagradDistributedBatchSampler
-
-
-def create_dataloader(manifest_path,
-                      unit_type,
-                      vocab_filepath,
-                      mean_std_filepath,
-                      spm_model_prefix,
-                      augmentation_config='{}',
-                      max_input_len=float('inf'),
-                      min_input_len=0.0,
-                      max_output_len=float('inf'),
-                      min_output_len=0.0,
-                      max_output_input_ratio=float('inf'),
-                      min_output_input_ratio=0.0,
-                      stride_ms=10.0,
-                      window_ms=20.0,
-                      max_freq=None,
-                      specgram_type='linear',
-                      feat_dim=None,
-                      delta_delta=False,
-                      use_dB_normalization=True,
-                      random_seed=0,
-                      keep_transcription_text=False,
-                      is_training=False,
-                      batch_size=1,
-                      num_workers=0,
-                      sortagrad=False,
-                      shuffle_method=None,
-                      dist=False):
-
-    dataset = ManifestDataset(
-        manifest_path=manifest_path,
-        unit_type=unit_type,
-        vocab_filepath=vocab_filepath,
-        mean_std_filepath=mean_std_filepath,
-        spm_model_prefix=spm_model_prefix,
-        augmentation_config=augmentation_config,
-        max_input_len=max_input_len,
-        min_input_len=min_input_len,
-        max_output_len=max_output_len,
-        min_output_len=min_output_len,
-        max_output_input_ratio=max_output_input_ratio,
-        min_output_input_ratio=min_output_input_ratio,
-        stride_ms=stride_ms,
-        window_ms=window_ms,
-        max_freq=max_freq,
-        specgram_type=specgram_type,
-        feat_dim=feat_dim,
-        delta_delta=delta_delta,
-        use_dB_normalization=use_dB_normalization,
-        random_seed=random_seed,
-        keep_transcription_text=keep_transcription_text)
-
-    if dist:
-        batch_sampler = SortagradDistributedBatchSampler(
-            dataset,
-            batch_size,
-            num_replicas=None,
-            rank=None,
-            shuffle=is_training,
-            drop_last=is_training,
-            sortagrad=is_training,
-            shuffle_method=shuffle_method)
-    else:
-        batch_sampler = SortagradBatchSampler(
-            dataset,
-            shuffle=is_training,
-            batch_size=batch_size,
-            drop_last=is_training,
-            sortagrad=is_training,
-            shuffle_method=shuffle_method)
-
-    def padding_batch(batch,
-                      padding_to=-1,
-                      flatten=False,
-                      keep_transcription_text=True):
-        """	
-        Padding audio features with zeros to make them have the same shape (or	
-        a user-defined shape) within one bach.	
-
-        If ``padding_to`` is -1, the maximun shape in the batch will be used	
-        as the target shape for padding. Otherwise, `padding_to` will be the	
-        target shape (only refers to the second axis).	
-
-        If `flatten` is True, features will be flatten to 1darray.	
-        """
-        new_batch = []
-        # get target shape	
-        max_length = max([audio.shape[1] for audio, text in batch])
-        if padding_to != -1:
-            if padding_to < max_length:
-                raise ValueError("If padding_to is not -1, it should be larger "
-                                 "than any instance's shape in the batch")
-            max_length = padding_to
-        max_text_length = max([len(text) for audio, text in batch])
-        # padding	
-        padded_audios = []
-        audio_lens = []
-        texts, text_lens = [], []
-        for audio, text in batch:
-            padded_audio = np.zeros([audio.shape[0], max_length])
-            padded_audio[:, :audio.shape[1]] = audio
-            if flatten:
-                padded_audio = padded_audio.flatten()
-            padded_audios.append(padded_audio)
-            audio_lens.append(audio.shape[1])
-
-            padded_text = np.zeros([max_text_length])
-            if keep_transcription_text:
-                padded_text[:len(text)] = [ord(t) for t in text]  # string
-            else:
-                padded_text[:len(text)] = text  # ids
-            texts.append(padded_text)
-            text_lens.append(len(text))
-
-        padded_audios = np.array(padded_audios).astype('float32')
-        audio_lens = np.array(audio_lens).astype('int64')
-        texts = np.array(texts).astype('int32')
-        text_lens = np.array(text_lens).astype('int64')
-        return padded_audios, audio_lens, texts, text_lens
-
-    # collate_fn=functools.partial(padding_batch, keep_transcription_text=keep_transcription_text),
-    collate_fn = SpeechCollator(keep_transcription_text=keep_transcription_text)
-    loader = DataLoader(
-        dataset,
-        batch_sampler=batch_sampler,
-        collate_fn=collate_fn,
-        num_workers=num_workers)
-    return loader
diff --git a/deepspeech/io/converter.py b/deepspeech/io/converter.py
new file mode 100644
index 000000000..a02e06acb
--- /dev/null
+++ b/deepspeech/io/converter.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from deepspeech.io.utility import pad_list
+from deepspeech.utils.log import Log
+
+__all__ = ["CustomConverter"]
+
+logger = Log(__name__).getlog()
+
+
+class CustomConverter():
+    """Custom batch converter.
+
+    Args:
+        subsampling_factor (int): The subsampling factor.
+        dtype (np.dtype): Data type to convert.
+        
+    """
+
+    def __init__(self, subsampling_factor=1, dtype=np.float32):
+        """Construct a CustomConverter object."""
+        self.subsampling_factor = subsampling_factor
+        self.ignore_id = -1
+        self.dtype = dtype
+
+    def __call__(self, batch):
+        """Transform a loaded batch into padded inputs, targets and lengths.
+
+        Args:
+            batch (list): The batch to transform.
+
+        Returns:
+            tuple: (utts, xs_pad, ilens, ys_pad, olens).
+
+        """
+        # batch should be located in list
+        assert len(batch) == 1
+        (xs, ys), utts = batch[0]
+
+        # perform subsampling
+        if self.subsampling_factor > 1:
+            xs = [x[::self.subsampling_factor, :] for x in xs]
+
+        # get batch of lengths of input sequences
+        ilens = np.array([x.shape[0] for x in xs])
+
+        # perform padding and convert to tensor
+        # currently only support real number
+        if xs[0].dtype.kind == "c":
+            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
+            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
+            # Note(kamo):
+            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
+            # Don't create ComplexTensor and give it E2E here
+            # because torch.nn.DataParallel can't handle it.
+            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
+        else:
+            xs_pad = pad_list(xs, 0).astype(self.dtype)
+
+        # NOTE: this is for multi-output (e.g., speech translation)
+        ys_pad = pad_list(
+            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
+            self.ignore_id)
+
+        olens = np.array(
+            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
+        return utts, xs_pad, ilens, ys_pad, olens
diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py
index b993d9a1a..3c4c2d5ef 100644
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
@@ -15,8 +15,8 @@ from paddle.io import DataLoader
 
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.io.batchfy import make_batchset
-from deepspeech.io.dataset import CustomConverter
 from deepspeech.io.dataset import TransformDataset
+from deepspeech.io.reader import CustomConverter
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.utils.log import Log
 
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 259b3b490..74c08b461 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -13,18 +13,13 @@
 # limitations under the License.
 from typing import Optional
 
-import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode
 
 from deepspeech.frontend.utility import read_manifest
-from deepspeech.io.utility import pad_list
 from deepspeech.utils.log import Log
 
-__all__ = [
-    "ManifestDataset", "TripletManifestDataset", "TransformDataset",
-    "CustomConverter"
-]
+__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
 
 logger = Log(__name__).getlog()
 
@@ -129,65 +124,6 @@ class TripletManifestDataset(ManifestDataset):
             "text1"]
 
 
-class CustomConverter():
-    """Custom batch converter.
-
-    Args:
-        subsampling_factor (int): The subsampling factor.
-        dtype (np.dtype): Data type to convert.
-        
-    """
-
-    def __init__(self, subsampling_factor=1, dtype=np.float32):
-        """Construct a CustomConverter object."""
-        self.subsampling_factor = subsampling_factor
-        self.ignore_id = -1
-        self.dtype = dtype
-
-    def __call__(self, batch):
-        """Transform a batch and send it to a device.
-
-        Args:
-            batch (list): The batch to transform.
-
-        Returns:
-            tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)
-
-        """
-        # batch should be located in list
-        assert len(batch) == 1
-        (xs, ys), utts = batch[0]
-
-        # perform subsampling
-        if self.subsampling_factor > 1:
-            xs = [x[::self.subsampling_factor, :] for x in xs]
-
-        # get batch of lengths of input sequences
-        ilens = np.array([x.shape[0] for x in xs])
-
-        # perform padding and convert to tensor
-        # currently only support real number
-        if xs[0].dtype.kind == "c":
-            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
-            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
-            # Note(kamo):
-            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
-            # Don't create ComplexTensor and give it E2E here
-            # because torch.nn.DataParellel can't handle it.
-            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
-        else:
-            xs_pad = pad_list(xs, 0).astype(self.dtype)
-
-        # NOTE: this is for multi-output (e.g., speech translation)
-        ys_pad = pad_list(
-            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
-            self.ignore_id)
-
-        olens = np.array(
-            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
-        return utts, xs_pad, ilens, ys_pad, olens
-
-
 class TransformDataset(Dataset):
     """Transform Dataset.
 

From bc9f444d8a31c4751d4aef5e4f90c37f2c3cc4cb Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 18 Aug 2021 07:53:52 +0000
Subject: [PATCH 15/17] add dataloader; check augmenter base class type

---
 deepspeech/frontend/augmentor/augmentation.py |  2 +
 deepspeech/io/dataloader.py                   | 53 +++++++++++++------
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py
index cfebc463c..7b43988e4 100644
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@@ -18,6 +18,7 @@ from inspect import signature
 
 import numpy as np
 
+from deepspeech.frontend.augmentor.base import AugmentorBase
 from deepspeech.utils.dynamic_import import dynamic_import
 from deepspeech.utils.log import Log
 
@@ -209,6 +210,7 @@ class AugmentationPipeline():
     def _get_augmentor(self, augmentor_type, params):
         """Return an augmentation model by the type name, and pass in params."""
         class_obj = dynamic_import(augmentor_type, import_alias)
+        assert issubclass(class_obj, AugmentorBase)
         try:
             obj = class_obj(self._rng, **params)
         except Exception:
diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py
index 3c4c2d5ef..15ab73157 100644
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
@@ -15,8 +15,8 @@ from paddle.io import DataLoader
 
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.io.batchfy import make_batchset
+from deepspeech.io.converter import CustomConverter
 from deepspeech.io.dataset import TransformDataset
-from deepspeech.io.reader import CustomConverter
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.utils.log import Log
 
@@ -46,7 +46,6 @@ class BatchDataLoader():
                  num_encs: int=1):
         self.json_file = json_file
         self.train_mode = train_mode
-
         self.use_sortagrad = sortagrad == -1 or sortagrad > 0
         self.batch_size = batch_size
         self.maxlen_in = maxlen_in
@@ -56,20 +55,17 @@ class BatchDataLoader():
         self.batch_frames_in = batch_frames_in
         self.batch_frames_out = batch_frames_out
         self.batch_frames_inout = batch_frames_inout
-
         self.subsampling_factor = subsampling_factor
         self.num_encs = num_encs
         self.preprocess_conf = preprocess_conf
-
         self.n_iter_processes = n_iter_processes
 
         # read json data
-        data_json = read_manifest(json_file)
-        logger.info(f"load {json_file} file.")
+        self.data_json = read_manifest(json_file)
 
         # make minibatch list (variable length)
-        self.data = make_batchset(
-            data_json,
+        self.minibaches = make_batchset(
+            self.data_json,
             batch_size,
             maxlen_in,
             maxlen_out,
@@ -83,9 +79,9 @@ class BatchDataLoader():
             batch_frames_inout=batch_frames_inout,
             iaxis=0,
             oaxis=0, )
-        logger.info(f"batchfy data {json_file}: {len(self.data)}.")
 
-        self.load = LoadInputsAndTargets(
+        # data reader
+        self.reader = LoadInputsAndTargets(
             mode="asr",
             load_output=True,
             preprocess_conf=preprocess_conf,
@@ -96,7 +92,7 @@ class BatchDataLoader():
         # Setup a converter
         if num_encs == 1:
             self.converter = CustomConverter(
-                subsampling_factor=subsampling_factor, dtype=dtype)
+                subsampling_factor=subsampling_factor, dtype=np.float32)
         else:
             assert NotImplementedError("not impl CustomConverterMulEnc.")
 
@@ -104,14 +100,39 @@ class BatchDataLoader():
         # actual bathsize is included in a list
         # default collate function converts numpy array to pytorch tensor
         # we used an empty collate function instead which returns list
-        self.train_loader = DataLoader(
-            dataset=TransformDataset(
-                self.data, lambda data: self.converter([self.load(data, return_uttid=True)])),
+        self.dataset = TransformDataset(
+            self.minibaches,
+            lambda data: self.converter([self.reader(data, return_uttid=True)]))
+        self.dataloader = DataLoader(
+            dataset=self.dataset,
             batch_size=1,
             shuffle=not use_sortagrad if train_mode else False,
             collate_fn=lambda x: x[0],
             num_workers=n_iter_processes, )
-        logger.info(f"dataloader for {json_file}.")
 
     def __repr__(self):
-        return f"DataLoader {self.json_file}-{self.train_mode}-{self.use_sortagrad}"
+        echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
+        echo += f"train_mode: {self.train_mode}, "
+        echo += f"sortagrad: {self.use_sortagrad}, "
+        echo += f"batch_size: {self.batch_size}, "
+        echo += f"maxlen_in: {self.maxlen_in}, "
+        echo += f"maxlen_out: {self.maxlen_out}, "
+        echo += f"batch_count: {self.batch_count}, "
+        echo += f"batch_bins: {self.batch_bins}, "
+        echo += f"batch_frames_in: {self.batch_frames_in}, "
+        echo += f"batch_frames_out: {self.batch_frames_out}, "
+        echo += f"batch_frames_inout: {self.batch_frames_inout}, "
+        echo += f"subsampling_factor: {self.subsampling_factor}, "
+        echo += f"num_encs: {self.num_encs}, "
+        echo += f"num_workers: {self.n_iter_processes}, "
+        echo += f"file: {self.json_file}"
+        return echo
+
+    def __len__(self):
+        return len(self.dataloader)
+
+    def __iter__(self):
+        return self.dataloader.__iter__()
+
+    def __call__(self):
+        return self.__iter__()

From e4d6c1a91d5fc689980b133045fb8bedb8f30eaf Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 18 Aug 2021 07:54:56 +0000
Subject: [PATCH 16/17] add batchdataloader test

---
 .notebook/espnet_dataloader.ipynb | 480 ++++++++++++++++++++----------
 1 file changed, 327 insertions(+), 153 deletions(-)

diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb
index 7abb138ff..1bfc13e3c 100644
--- a/.notebook/espnet_dataloader.ipynb
+++ b/.notebook/espnet_dataloader.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 147,
    "id": "extensive-venice",
    "metadata": {},
    "outputs": [
@@ -10,16 +10,16 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/workspace/zhanghui/DeepSpeech-2.x\n"
+      "/\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "'/workspace/zhanghui/DeepSpeech-2.x'"
+       "'/'"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 147,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 148,
    "id": "correct-window",
    "metadata": {},
    "outputs": [
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 149,
    "id": "exceptional-cheese",
    "metadata": {},
    "outputs": [],
@@ -60,53 +60,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 150,
    "id": "extraordinary-orleans",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n",
-      "register user softmax to paddle, remove this when fixed!\n",
-      "register user log_softmax to paddle, remove this when fixed!\n",
-      "register user sigmoid to paddle, remove this when fixed!\n",
-      "register user log_sigmoid to paddle, remove this when fixed!\n",
-      "register user relu to paddle, remove this when fixed!\n",
-      "override cat of paddle if exists or register, remove this when fixed!\n",
-      "override long of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "override eq of paddle if exists or register, remove this when fixed!\n",
-      "override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
-      "register user view to paddle.Tensor, remove this when fixed!\n",
-      "register user view_as to paddle.Tensor, remove this when fixed!\n",
-      "register user masked_fill to paddle.Tensor, remove this when fixed!\n",
-      "register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
-      "register user fill_ to paddle.Tensor, remove this when fixed!\n",
-      "register user repeat to paddle.Tensor, remove this when fixed!\n",
-      "register user softmax to paddle.Tensor, remove this when fixed!\n",
-      "register user sigmoid to paddle.Tensor, remove this when fixed!\n",
-      "register user relu to paddle.Tensor, remove this when fixed!\n",
-      "register user type_as to paddle.Tensor, remove this when fixed!\n",
-      "register user to to paddle.Tensor, remove this when fixed!\n",
-      "register user float to paddle.Tensor, remove this when fixed!\n",
-      "register user int to paddle.Tensor, remove this when fixed!\n",
-      "register user GLU to paddle.nn, remove this when fixed!\n",
-      "register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
-      "register user export to paddle.jit, remove this when fixed!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from deepspeech.frontend.utility import read_manifest"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 151,
    "id": "returning-lighter",
    "metadata": {},
    "outputs": [],
@@ -116,7 +80,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 152,
    "id": "western-founder",
    "metadata": {},
    "outputs": [
@@ -158,7 +122,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 97,
    "id": "motivated-receptor",
    "metadata": {},
    "outputs": [],
@@ -638,10 +602,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 98,
    "id": "acquired-hurricane",
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[INFO 2021/08/18 06:57:10 1445365138.py:284] use shuffled batch.\n",
+      "[INFO 2021/08/18 06:57:10 1445365138.py:286] # utts: 5542\n",
+      "[INFO 2021/08/18 06:57:10 1445365138.py:468] # minibatches: 555\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -686,7 +659,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 99,
    "id": "warming-malpractice",
    "metadata": {},
    "outputs": [
@@ -694,16 +667,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Collecting kaldiio\n",
-      "  Downloading kaldiio-2.17.2.tar.gz (24 kB)\n",
-      "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages/numpy-1.21.2-py3.7-linux-x86_64.egg (from kaldiio) (1.21.2)\n",
-      "Building wheels for collected packages: kaldiio\n",
-      "  Building wheel for kaldiio (setup.py) ... \u001b[?25ldone\n",
-      "\u001b[?25h  Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24468 sha256=cd6e066764dcc8c24a9dfe3f7bd8acda18761a6fbcb024995729da8debdb466e\n",
-      "  Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n",
-      "Successfully built kaldiio\n",
-      "Installing collected packages: kaldiio\n",
-      "Successfully installed kaldiio-2.17.2\n",
+      "Requirement already satisfied: kaldiio in ./DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages (2.17.2)\n",
+      "Requirement already satisfied: numpy in ./DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numpy-1.21.2-py3.7-linux-x86_64.egg (from kaldiio) (1.21.2)\n",
       "\u001b[33mWARNING: You are using pip version 20.3.3; however, version 21.2.4 is available.\n",
       "You should consider upgrading via the '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
      ]
@@ -723,7 +688,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 100,
    "id": "superb-methodology",
    "metadata": {},
    "outputs": [],
@@ -1029,7 +994,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 101,
    "id": "monthly-muscle",
    "metadata": {},
    "outputs": [],
@@ -1047,7 +1012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 102,
    "id": "periodic-senegal",
    "metadata": {},
    "outputs": [],
@@ -1057,7 +1022,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 103,
    "id": "502d3f4d",
    "metadata": {},
    "outputs": [
@@ -1069,8 +1034,8 @@
       "2\n",
       "10\n",
       "10\n",
-      "(1763, 83) float32\n",
-      "(73,) int64\n"
+      "(1174, 83) float32\n",
+      "(29,) int64\n"
      ]
     }
    ],
@@ -1088,7 +1053,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 104,
    "id": "humanitarian-container",
    "metadata": {},
    "outputs": [],
@@ -1098,7 +1063,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 105,
    "id": "heard-prize",
    "metadata": {},
    "outputs": [
@@ -1106,7 +1071,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038'] 10\n",
+      "['4572-112383-0005', '6313-66125-0015', '251-137823-0022', '2277-149896-0030', '652-130726-0032', '5895-34615-0013', '1462-170138-0002', '777-126732-0008', '3660-172182-0021', '2277-149896-0027'] 10\n",
       "10\n"
      ]
     }
@@ -1118,7 +1083,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 106,
    "id": "convinced-animation",
    "metadata": {},
    "outputs": [],
@@ -1185,7 +1150,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 107,
    "id": "0b92ade5",
    "metadata": {},
    "outputs": [],
@@ -1195,7 +1160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 108,
    "id": "8dbd847c",
    "metadata": {},
    "outputs": [],
@@ -1205,7 +1170,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 109,
    "id": "31c085f4",
    "metadata": {},
    "outputs": [
@@ -1213,72 +1178,42 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038']\n",
-      "(10, 1763, 83)\n",
+      "['4572-112383-0005', '6313-66125-0015', '251-137823-0022', '2277-149896-0030', '652-130726-0032', '5895-34615-0013', '1462-170138-0002', '777-126732-0008', '3660-172182-0021', '2277-149896-0027']\n",
+      "(10, 1174, 83)\n",
       "(10,)\n",
-      "[1763 1214 1146  757  751  661  625  512  426  329]\n",
-      "(10, 73)\n",
-      "[[2896  621 4502 2176  404  198 3538  391  278  407  389 3719 4577  846\n",
-      "  4501  482 1004  103  116  178 4222  624 4689  176  459   89  101 3465\n",
-      "  3204 4502 2029 1834 2298  829 3366  278 4705 4925  482 2920 3204 2481\n",
-      "   448  627 1254  404   20  202   36 2047  627 2495 4504  481  479   99\n",
-      "    18 2079 4502 1628  202  226 4512 3267  210  278  483  234  367 4502\n",
-      "  2438 3204 1141]\n",
-      " [ 742 4501 4768 4569  742 4483 2495 4502 3040 3204 4502 3961 3204 3992\n",
-      "  3089 4832 4258  621 2391 4642 3218 4502 3439  235  270  313 2385 2833\n",
-      "   742 4502 3282  332    3  280 4237 3252  830 2387   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [2099  278 4904 2302  124 4832 3158  482 2888 2495  482 2450  627 1560\n",
-      "  3158 4729  482 3514 3204 1027 3233 2391 2862  399  389 4962 2495  121\n",
-      "   221    7 2340 1216 1658   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [2458 2659 1362    2  404 4975 4995  487 3079 2785 2371 3158  824 2603\n",
-      "  4832 2323  999 2603 4832 4156 4678  627 1784   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [2458 2340 1661  101 4723 2138 4502 4690  463  332  251 2345 4534 4502\n",
-      "  2396  444 4501 2287  389 4531 4894 1466  959  389 1658 2584 4502 3681\n",
-      "   279 3204 4502 2228 3204 4502 4690  463  332  251   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [2368 1248  208 4832 3158  482 1473 3401  999  482 4159 3838  389  478\n",
-      "  4572  404 3158 3063 1481  113 4499 4501 3204 4643    2  389 4111   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [2882 2932 4329 1808 4577 4350 4577  482 1636    2  389 1841 3204 3079\n",
-      "  1091  389 3204 2816 2079 4172 4986 4990   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [4869 2598 2603 1976   96  389  478    3 4031  721 4925 2263 1259 2598\n",
-      "  4508  653 4979 4925 2741  252   72  236   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [2458 4447 4505  713  624 3207  206 4577 4502 2404 3837 3458 2812 4936\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]\n",
-      " [1501 3897 2537  278 2601    2  404 2603  482 2235 3388   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
-      "    -1   -1   -1]]\n",
-      "[73 38 33 23 38 27 22 22 14 11]\n",
+      "[1174  821  716  628  597  473  463  441  419  358]\n",
+      "(10, 32)\n",
+      "[[4502 2404 4223 3204 4502  587 1018 3861 2932  713 2458 2916  253 4508\n",
+      "   627 1395  713 4504  957 2761  209 2967 3173 3918 2598 4100    3 2816\n",
+      "  4990   -1   -1   -1]\n",
+      " [1005  451  210  278 3411  206  482 2307  573 4502 3848 4577 4273 2388\n",
+      "  4444   89 4919  278 1264 4501 2371    3  139  113 2603 4962 3158 3325\n",
+      "  4577  814 4587 1422]\n",
+      " [2345 4144 2291  200  713 2345  532  999 2458 3076  545 2458 4832 3038\n",
+      "  4499  482 2812 1260 3080   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1]\n",
+      " [2345  832 4577 4920 4501 2345 2298 1236  381  288  389  101 2495 4172\n",
+      "  4843 3233 3245 4501 2345 2298 3987 4502 3023 3353 2345 1361 1635 2603\n",
+      "  4723 2371   -1   -1]\n",
+      " [4502 4207  432 3204 4502 2396  125  935  433 2598  483   18  327    2\n",
+      "   389  627 4512 2340  713  482 1981 4525 4031  269 2030 1340  101 2495\n",
+      "  4013 4844   -1   -1]\n",
+      " [4502 4892 3204 1892 3780  389  482 2774 3013   89  192 2495 4502 3475\n",
+      "   389   66  370  343  404   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1]\n",
+      " [2458 2314 4577 2340 2863 1254  303  269    2  389  932 2079 4577  299\n",
+      "   195 3233 4508    2   89  814 3144 1091 3204 3250 2193 3414   -1   -1\n",
+      "    -1   -1   -1   -1]\n",
+      " [2391 1785  443   78   39 4962 2340  829  599 4593  278 4681  202  407\n",
+      "   269  194  182 4577  482 4308   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1]\n",
+      " [ 627 4873 2175  363  202  404 1018 4577 4502 3412 4875 2286  107  122\n",
+      "  4832 2345 3896   89 2368   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1]\n",
+      " [ 481  174  474  599 1881 3252 2842  742 4502 2545  107   88 3204 4525\n",
+      "  4517   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1]]\n",
+      "[29 32 19 30 30 19 26 20 19 15]\n",
       "float32\n",
       "int64\n",
       "int64\n",
@@ -1302,42 +1237,281 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": 110,
    "id": "72e9ba60",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 230,
+   "id": "64593e5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from paddle.io import DataLoader\n",
+    "\n",
+    "from deepspeech.frontend.utility import read_manifest\n",
+    "from deepspeech.io.batchfy import make_batchset\n",
+    "from deepspeech.io.converter import CustomConverter\n",
+    "from deepspeech.io.dataset import TransformDataset\n",
+    "from deepspeech.io.reader import LoadInputsAndTargets\n",
+    "from deepspeech.utils.log import Log\n",
+    "\n",
+    "\n",
+    "logger = Log(__name__).getlog()\n",
+    "\n",
+    "\n",
+    "class BatchDataLoader():\n",
+    "    \"\"\"Serve minibatches built from a manifest file through a paddle DataLoader.\"\"\"\n",
+    "    def __init__(self, json_file: str,\n",
+    "                 train_mode: bool,\n",
+    "                 sortagrad: bool=False,\n",
+    "                 batch_size: int=0,\n",
+    "                 maxlen_in: float=float('inf'),\n",
+    "                 maxlen_out: float=float('inf'),\n",
+    "                 minibatches: int=0,\n",
+    "                 mini_batch_size: int=1,\n",
+    "                 batch_count: str='auto',\n",
+    "                 batch_bins: int=0,\n",
+    "                 batch_frames_in: int=0,\n",
+    "                 batch_frames_out: int=0,\n",
+    "                 batch_frames_inout: int=0,\n",
+    "                 preprocess_conf=None,\n",
+    "                 n_iter_processes: int=1,\n",
+    "                 subsampling_factor: int=1,\n",
+    "                 num_encs: int=1):\n",
+    "        self.json_file = json_file\n",
+    "        self.train_mode = train_mode\n",
+    "        self.use_sortagrad = sortagrad == -1 or sortagrad > 0\n",
+    "        self.batch_size = batch_size\n",
+    "        self.maxlen_in = maxlen_in\n",
+    "        self.maxlen_out = maxlen_out\n",
+    "        self.batch_count = batch_count\n",
+    "        self.batch_bins = batch_bins\n",
+    "        self.batch_frames_in = batch_frames_in\n",
+    "        self.batch_frames_out = batch_frames_out\n",
+    "        self.batch_frames_inout = batch_frames_inout\n",
+    "        self.subsampling_factor = subsampling_factor\n",
+    "        self.num_encs = num_encs\n",
+    "        self.preprocess_conf = preprocess_conf\n",
+    "        self.n_iter_processes = n_iter_processes\n",
+    "\n",
+    "        \n",
+    "        # read json data\n",
+    "        self.data_json = read_manifest(json_file)\n",
+    "\n",
+    "        # make minibatch list (variable length)\n",
+    "        self.minibaches = make_batchset(\n",
+    "            self.data_json,\n",
+    "            batch_size,\n",
+    "            maxlen_in,\n",
+    "            maxlen_out,\n",
+    "            minibatches,  # for debug\n",
+    "            min_batch_size=mini_batch_size,\n",
+    "            shortest_first=self.use_sortagrad,\n",
+    "            count=batch_count,\n",
+    "            batch_bins=batch_bins,\n",
+    "            batch_frames_in=batch_frames_in,\n",
+    "            batch_frames_out=batch_frames_out,\n",
+    "            batch_frames_inout=batch_frames_inout,\n",
+    "            iaxis=0,\n",
+    "            oaxis=0, )\n",
+    "\n",
+    "        # data reader\n",
+    "        self.reader = LoadInputsAndTargets(\n",
+    "            mode=\"asr\",\n",
+    "            load_output=True,\n",
+    "            preprocess_conf=preprocess_conf,\n",
+    "            preprocess_args={\"train\":\n",
+    "                             train_mode},  # Switch the mode of preprocessing\n",
+    "        )\n",
+    "\n",
+    "        # Setup a converter\n",
+    "        if num_encs == 1:\n",
+    "            self.converter = CustomConverter(\n",
+    "                subsampling_factor=subsampling_factor, dtype=np.float32)\n",
+    "        else:\n",
+    "            raise NotImplementedError(\"not impl CustomConverterMulEnc.\")\n",
+    "\n",
+    "        # hack: DataLoader batch_size is fixed to 1 because every dataset item\n",
+    "        # is already a whole minibatch produced by make_batchset above;\n",
+    "        # the converter runs inside the dataset transform, so collate_fn only\n",
+    "        # unwraps the single-element list that DataLoader hands it.\n",
+    "        self.dataset = TransformDataset(self.minibaches, \n",
+    "                                        lambda data: self.converter([self.reader(data, return_uttid=True)]))\n",
+    "        self.dataloader = DataLoader(\n",
+    "            dataset=self.dataset,\n",
+    "            batch_size=1,\n",
+    "            shuffle=not self.use_sortagrad if train_mode else False,\n",
+    "            collate_fn=lambda x: x[0],\n",
+    "            num_workers=n_iter_processes, )\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        echo = f\"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> \"\n",
+    "        echo += f\"train_mode: {self.train_mode}, \"\n",
+    "        echo += f\"sortagrad: {self.use_sortagrad}, \"\n",
+    "        echo += f\"batch_size: {self.batch_size}, \"\n",
+    "        echo += f\"maxlen_in: {self.maxlen_in}, \"\n",
+    "        echo += f\"maxlen_out: {self.maxlen_out}, \"\n",
+    "        echo += f\"batch_count: {self.batch_count}, \"\n",
+    "        echo += f\"batch_bins: {self.batch_bins}, \"\n",
+    "        echo += f\"batch_frames_in: {self.batch_frames_in}, \"\n",
+    "        echo += f\"batch_frames_out: {self.batch_frames_out}, \"\n",
+    "        echo += f\"batch_frames_inout: {self.batch_frames_inout}, \"\n",
+    "        echo += f\"subsampling_factor: {self.subsampling_factor}, \"\n",
+    "        echo += f\"num_encs: {self.num_encs}, \"\n",
+    "        echo += f\"num_workers: {self.n_iter_processes}, \"\n",
+    "        echo += f\"file: {self.json_file}\"\n",
+    "        return echo\n",
+    "    \n",
+    "    def __len__(self):\n",
+    "        return len(self.dataloader)\n",
+    "    \n",
+    "    def __iter__(self):\n",
+    "        return self.dataloader.__iter__()\n",
+    "    \n",
+    "    def __call__(self):\n",
+    "        return self.__iter__()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 231,
+   "id": "fcea3fd0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[INFO 2021/08/18 07:42:23 batchfy.py:399] count is auto detected as seq\n",
+      "[INFO 2021/08/18 07:42:23 batchfy.py:423] # utts: 5542\n",
+      "[INFO 2021/08/18 07:42:23 batchfy.py:466] # minibatches: 278\n"
+     ]
+    }
+   ],
    "source": [
-    "from pathlib import Path"
+    "train = BatchDataLoader(dev_data, True, batch_size=20)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
-   "id": "64593e5f",
+   "execution_count": 232,
+   "id": "e2a2c9a8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "278\n",
+      "['__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'auto_collate_batch', 'batch_sampler', 'batch_size', 'collate_fn', 'dataset', 'dataset_kind', 'feed_list', 'from_dataset', 'from_generator', 'num_workers', 'pin_memory', 'places', 'return_list', 'timeout', 'use_buffer_reader', 'use_shared_memory', 'worker_init_fn']\n",
+      "<__main__.BatchDataLoader object at 0x7fdddba35470> train_mode: True, sortagrad: False, batch_size: 20, maxlen_in: inf, maxlen_out: inf, batch_count: auto, batch_bins: 0, batch_frames_in: 0, batch_frames_out: 0, batch_frames_inout: 0, subsampling_factor: 1, num_encs: 1, num_workers: 1, file: /workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev\n",
+      "278\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len(train.dataloader))\n",
+    "print(dir(train.dataloader))\n",
+    "print(train)\n",
+    "print(len(train))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 220,
+   "id": "a5ba7d6e",
    "metadata": {},
    "outputs": [
     {
-     "ename": "AttributeError",
-     "evalue": "'str' object has no attribute 'stat'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
-      "\u001b[0;32m/tmp/ipykernel_48616/3505477735.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'xxxxxxxx'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mPath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m/usr/local/lib/python3.7/pathlib.py\u001b[0m in \u001b[0;36mis_file\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1342\u001b[0m         \"\"\"\n\u001b[1;32m   1343\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1344\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mS_ISREG\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mst_mode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1345\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1346\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merrno\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mENOENT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mENOTDIR\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'stat'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['7601-101619-0003', '1255-138279-0000', '1272-128104-0004', '6123-59150-0027', '2078-142845-0025', '7850-73752-0018', '4570-24733-0004', '2506-169427-0002', '7601-101619-0004', '3170-137482-0000', '6267-53049-0019', '4570-14911-0009', '174-168635-0018', '7601-291468-0004', '3576-138058-0022', '1919-142785-0007', '6467-62797-0007', '4153-61735-0005', '1686-142278-0003', '2506-169427-0000']\n",
+      "Tensor(shape=[20, 2961, 83], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [[[-1.99415934, -1.80315673, -1.88801885, ...,  0.86933994, -0.59853148,  0.02596200],\n",
+      "         [-1.95346808, -1.84891188, -2.17492867, ...,  0.83640492, -0.59853148, -0.11333394],\n",
+      "         [-2.27899861, -2.21495342, -2.58480024, ...,  0.91874266, -0.59853148, -0.31453922],\n",
+      "         ...,\n",
+      "         [-2.64522028, -2.35221887, -2.91269732, ...,  1.48994756, -0.16100442,  0.36646330],\n",
+      "         [-2.40107250, -2.21495342, -2.37986445, ...,  1.44072104, -0.13220564,  0.12656468],\n",
+      "         [-2.15692472, -1.89466715, -2.25690317, ...,  1.31273174, -0.09620714, -0.15202725]],\n",
+      "\n",
+      "        [[-0.28859532, -0.29033494, -0.86576819, ...,  1.37753224, -0.30570769,  0.25806731],\n",
+      "         [-0.20149794, -0.17814466, -0.59891301, ...,  1.35188794, -0.30570769, -0.02964944],\n",
+      "         [-0.34947991, -0.33597648, -0.96877253, ...,  1.38394332, -0.30570769, -0.38376236],\n",
+      "         ...,\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ]],\n",
+      "\n",
+      "        [[-0.44914246, -0.33902276, -0.78237975, ...,  1.38218808,  0.29214793, -0.16815147],\n",
+      "         [-0.55490732, -0.41596055, -0.84425378, ...,  1.34530187,  0.25002354, -0.04004869],\n",
+      "         [-0.83694696, -0.62112784, -1.07112527, ...,  1.19160914,  0.20789915,  0.37984371],\n",
+      "         ...,\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ]],\n",
+      "\n",
+      "        ...,\n",
+      "\n",
+      "        [[-1.24343657, -0.94188881, -1.41092563, ...,  0.96716309,  0.60345763,  0.15360183],\n",
+      "         [-1.19466043, -0.80585432, -0.49723154, ...,  1.06735480,  0.60345763,  0.14511746],\n",
+      "         [-0.94079566, -0.59330046, -0.40948665, ...,  0.82244170,  0.55614340,  0.28086722],\n",
+      "         ...,\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ]],\n",
+      "\n",
+      "        [[ 0.21757117,  0.11361472, -0.33262897, ...,  0.76338506, -0.10711290, -0.57754958],\n",
+      "         [-1.00205481, -0.61152041, -0.47124696, ...,  1.11897349, -0.10711290,  0.24931324],\n",
+      "         [-1.03929281, -1.20336759, -1.16433656, ...,  0.88888687, -0.10711290, -0.04115745],\n",
+      "         ...,\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ]],\n",
+      "\n",
+      "        [[-1.25289667, -1.05046368, -0.82881606, ...,  1.23991334,  0.61702502,  0.05275881],\n",
+      "         [-1.19659519, -0.78677225, -0.80407262, ...,  1.27644968,  0.61702502, -0.35079369],\n",
+      "         [-1.49687004, -1.01750231, -0.82881606, ...,  1.29106426,  0.65006059,  0.17958963],\n",
+      "         ...,\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ],\n",
+      "         [ 0.        ,  0.        ,  0.        , ...,  0.        ,  0.        ,  0.        ]]])\n",
+      "Tensor(shape=[20], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [2961, 2948, 2938, 2907, 2904, 2838, 2832, 2819, 2815, 2797, 2775, 2710, 2709, 2696, 2688, 2661, 2616, 2595, 2589, 2576])\n",
+      "Tensor(shape=[20, 133], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [[3098, 1595,  389, ..., -1  , -1  , -1  ],\n",
+      "        [2603, 4832,  482, ..., -1  , -1  , -1  ],\n",
+      "        [2796,  303,  269, ..., -1  , -1  , -1  ],\n",
+      "        ...,\n",
+      "        [3218, 3673,  206, ..., -1  , -1  , -1  ],\n",
+      "        [2371, 4832, 4031, ..., -1  , -1  , -1  ],\n",
+      "        [2570, 2433, 4285, ..., -1  , -1  , -1  ]])\n",
+      "Tensor(shape=[20], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [80 , 83 , 102, 133, 82 , 102, 71 , 91 , 68 , 81 , 86 , 67 , 71 , 95 , 65 , 88 , 97 , 98 , 89 , 72 ])\n"
      ]
     }
    ],
    "source": [
-    "s='xxxxxxxx'\n",
-    "Path.is_file(s)"
+    "for batch in train:\n",
+    "    utts, xs, ilens, ys, olens = batch\n",
+    "    print(utts)\n",
+    "    print(xs)\n",
+    "    print(ilens)\n",
+    "    print(ys)\n",
+    "    print(olens)\n",
+    "    break"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fcea3fd0",
+   "id": "3c974a1e",
    "metadata": {},
    "outputs": [],
    "source": []

From b12b0183860a5cd0b7b5dd221592876e377aaebd Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 18 Aug 2021 08:13:22 +0000
Subject: [PATCH 17/17] fix docstring

---
 deepspeech/io/batchfy.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py
index 54c6f0e14..de29d0546 100644
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
@@ -337,10 +337,10 @@ def make_batchset(
 
     if utts have "category" value,
 
-        >>> data = {'utt1': {'category': 'A', 'input': ...},
-        ...         'utt2': {'category': 'B', 'input': ...},
-        ...         'utt3': {'category': 'B', 'input': ...},
-        ...         'utt4': {'category': 'A', 'input': ...}}
+        >>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'},
+        ...         {'category': 'B', 'input': ..., 'utt':'utt2'},
+        ...         {'category': 'B', 'input': ..., 'utt':'utt3'},
+        ...         {'category': 'A', 'input': ..., 'utt':'utt4'}]
         >>> make_batchset(data, batchsize=2, ...)
         [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]]