From d75cf8963030648fc682c849002d379797e9103d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Sun, 26 Sep 2021 21:13:30 +0800 Subject: [PATCH 01/11] Update released_model.md --- docs/src/released_model.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/src/released_model.md b/docs/src/released_model.md index 61fd1560..50670aaf 100644 --- a/docs/src/released_model.md +++ b/docs/src/released_model.md @@ -1,21 +1,21 @@ # Released Models ## Acoustic Model Released in paddle 2.X -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :--------- -[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | 151 h -[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 | 151 h -[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | 151 h -[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | 151 h -[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0325 | 960 h -[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0544 | 960 h +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech +:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- +[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h +[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h +[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h +[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h +[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | 
Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h +[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h ## Acoustic Model Transformed from paddle 1.8 -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech -:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :--------- -[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 | 151 h| -[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers | 0.0685| 960 h| -[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers | 0.0541 | 8628 h| +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech +:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :--------- +[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h| +[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h| +[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h| From 9c37d10992a2237730f5c3f8fc13ec584dc87b09 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 27 Sep 2021 06:53:57 +0000 Subject: [PATCH 02/11] optimize the log --- deepspeech/exps/u2/model.py | 1 + deepspeech/training/trainer.py | 1 + 2 files changed, 2 insertions(+) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 5cb0962a..1afd9b10 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -216,6 +216,7 @@ class U2Trainer(Trainer): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += "," + msg = msg[:-1] # remove the last "," if (batch_index + 1 ) % self.config.training.log_interval == 0: logger.info(msg) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 15259f0e..35b1690b 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -263,6 +263,7 @@ class Trainer(): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += "," + msg = msg[:-1] # remove the last "," logger.info(msg) data_start_time = time.time() except Exception as e: From f7d7e70cb24338e61e921240c18c20bc88456150 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 27 Sep 2021 10:50:20 +0000 Subject: [PATCH 03/11] more ctc check; valid dataloader with num workers --- deepspeech/exps/u2/model.py | 4 +++- deepspeech/modules/ctc.py | 2 +- deepspeech/modules/loss.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/u2/model.py 
b/deepspeech/exps/u2/model.py index 5cb0962a..5cf8866c 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -243,6 +243,7 @@ class U2Trainer(Trainer): self.visualizer.add_scalars( 'epoch', {'cv_loss': cv_loss, 'lr': self.lr_scheduler()}, self.epoch) + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() @@ -291,7 +292,8 @@ class U2Trainer(Trainer): batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers, ) # test dataset, return raw text config.data.manifest = config.data.test_manifest diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py index 11ce871f..551bbf67 100644 --- a/deepspeech/modules/ctc.py +++ b/deepspeech/modules/ctc.py @@ -49,7 +49,7 @@ class CTCDecoder(nn.Layer): dropout_rate (float): dropout rate (0.0 ~ 1.0) reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none' batch_average (bool): do batch dim wise average. - grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None. + grad_norm_type (str): one of 'instance', 'batch', 'frame', None. """ assert check_argument_types() super().__init__() diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 7d24e170..1f33e512 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -49,6 +49,8 @@ class CTCLoss(nn.Layer): self.norm_by_batchsize = True elif grad_norm_type == 'frame': self.norm_by_total_logits_len = True + else: + raise ValueError(f"CTCLoss Grad Norm no support {grad_norm_type}") def forward(self, logits, ys_pad, hlens, ys_lens): """Compute CTC loss. From 1a46125175b9428bd4c481f79117598330bad535 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 28 Sep 2021 05:46:29 +0000 Subject: [PATCH 04/11] add bin for hub --- deepspeech/exps/deepspeech2/bin/test_hub.py | 191 ++++++++++++++++++++ examples/aishell/s0/local/test_hub.sh | 36 ++++ examples/aishell/s0/run.sh | 8 + 3 files changed, 235 insertions(+) create mode 100644 deepspeech/exps/deepspeech2/bin/test_hub.py create mode 100755 examples/aishell/s0/local/test_hub.sh diff --git a/deepspeech/exps/deepspeech2/bin/test_hub.py b/deepspeech/exps/deepspeech2/bin/test_hub.py new file mode 100644 index 00000000..cbda3b4c --- /dev/null +++ b/deepspeech/exps/deepspeech2/bin/test_hub.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Evaluation for DeepSpeech2 model.""" +import os +import sys +from pathlib import Path + +import paddle + +from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer +from deepspeech.io.collator import SpeechCollator +from deepspeech.models.ds2 import DeepSpeech2Model +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline +from deepspeech.training.cli import default_argument_parser +from deepspeech.utils import mp_tools +from deepspeech.utils.checkpoint import Checkpoint +from deepspeech.utils.log import Log +from deepspeech.utils.utility import print_arguments +from deepspeech.utils.utility import UpdateConfig + +logger = Log(__name__).getlog() + + +class DeepSpeech2Tester_hub(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + self.collate_fn_test = SpeechCollator.from_config(config) + self._text_featurizer = TextFeaturizer( + unit_type=config.collator.unit_type, vocab_filepath=None) + + def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + result_transcripts = self.model.decode( + audio, + audio_len, + vocab_list, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch) + #replace the '' with ' ' + result_transcripts = [ + self._text_featurizer.detokenize(sentence) + for sentence in result_transcripts + ] + + return result_transcripts + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + self.model.eval() + cfg = self.config + audio_file = self.audio_file + collate_fn_test = self.collate_fn_test + audio, _ = collate_fn_test.process_utterance( + audio_file=audio_file, transcript=" ") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + vocab_list = collate_fn_test.vocab_list + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, cfg.decoding) + logger.info("result_transcripts: " + result_transcripts[0]) + + def run_test(self): + self.resume() + try: + self.test() + except KeyboardInterrupt: + exit(-1) + + def setup(self): + """Setup the experiment. + """ + paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu') + + self.setup_output_dir() + self.setup_checkpointer() + + self.setup_model() + + def setup_output_dir(self): + """Create a directory used for output. + """ + # output dir + if self.args.output: + output_dir = Path(self.args.output).expanduser() + output_dir.mkdir(parents=True, exist_ok=True) + else: + output_dir = Path( + self.args.checkpoint_path).expanduser().parent.parent + output_dir.mkdir(parents=True, exist_ok=True) + self.output_dir = output_dir + + def setup_model(self): + config = self.config.clone() + with UpdateConfig(config): + config.model.feat_size = self.collate_fn_test.feature_size + config.model.dict_size = self.collate_fn_test.vocab_size + + if self.args.model_type == 'offline': + model = DeepSpeech2Model.from_config(config.model) + elif self.args.model_type == 'online': + model = DeepSpeech2ModelOnline.from_config(config.model) + else: + raise Exception("wrong model type") + + self.model = model + + def setup_checkpointer(self): + """Create a directory used to save checkpoints into. 
+ + It is "checkpoints" inside the output directory. + """ + # checkpoint dir + checkpoint_dir = self.output_dir / "checkpoints" + checkpoint_dir.mkdir(exist_ok=True) + + self.checkpoint_dir = checkpoint_dir + + self.checkpoint = Checkpoint( + kbest_n=self.config.training.checkpoint.kbest_n, + latest_n=self.config.training.checkpoint.latest_n) + + def resume(self): + """Resume from the checkpoint at checkpoints in the output + directory or load a specified checkpoint. + """ + params_path = self.args.checkpoint_path + ".pdparams" + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + +def main_sp(config, args): + exp = DeepSpeech2Tester_hub(config, args) + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument("--model_type") + parser.add_argument("--audio_file") + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + if args.model_type is None: + args.model_type = 'offline' + if not os.path.isfile(args.audio_file): + print("Please input the audio file path") + sys.exit(-1) + print("model_type:{}".format(args.model_type)) + + # https://yaml.org/type/float.html + config = get_cfg_defaults(args.model_type) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + main(config, args) diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/s0/local/test_hub.sh new file mode 100755 index 00000000..d01496c4 --- /dev/null +++ b/examples/aishell/s0/local/test_hub.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ $# != 4 ];then + echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 +model_type=$3 +audio_file=$4 + +# download language model +bash local/download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi + +python3 -u ${BIN_DIR}/test_hub.py \ +--nproc ${ngpu} \ +--config ${config_path} \ +--result_file ${ckpt_prefix}.rsl \ +--checkpoint_path ${ckpt_prefix} \ +--model_type ${model_type} \ +--audio_file ${audio_file} + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 71191c3a..83846ada 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -15,6 +15,8 @@ avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') echo "checkpoint name ${ckpt}" +audio_file="data/tmp.wav" + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data bash ./local/data.sh || exit -1 @@ -44,3 +46,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi + +# Optionally, you can add LM and test it with runtime. 
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 +fi From f628e218167a17af501d4e84c3a20d5ad804f629 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 10:33:35 +0000 Subject: [PATCH 05/11] refactor kaldi/tarfile loader, st and asr collator --- deepspeech/exps/u2_st/model.py | 23 +- deepspeech/frontend/audio.py | 20 +- .../frontend/featurizer/speech_featurizer.py | 77 +-- deepspeech/frontend/speech.py | 11 +- deepspeech/frontend/utility.py | 46 ++ deepspeech/io/collator.py | 359 +++++----- deepspeech/io/collator_st.py | 631 ------------------ deepspeech/io/reader.py | 16 + 8 files changed, 322 insertions(+), 861 deletions(-) delete mode 100644 deepspeech/io/collator_st.py diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index e4e70292..f5a514c7 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -28,10 +28,8 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode -from deepspeech.io.collator_st import KaldiPrePorocessedCollator -from deepspeech.io.collator_st import SpeechCollator -from deepspeech.io.collator_st import TripletKaldiPrePorocessedCollator -from deepspeech.io.collator_st import TripletSpeechCollator +from deepspeech.io.collator import SpeechCollator +from deepspeech.io.collator import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler @@ -258,22 +256,13 @@ class U2STTrainer(Trainer): config.data.manifest = config.data.dev_manifest dev_dataset = Dataset.from_config(config) - if config.collator.raw_wav: - if config.model.model_conf.asr_weight > 0.: - Collator = TripletSpeechCollator - TestCollator = SpeechCollator - else: - TestCollator = Collator = SpeechCollator - # Not yet implement the mtl loader for raw_wav. + if config.model.model_conf.asr_weight > 0.: + Collator = TripletSpeechCollator + TestCollator = SpeechCollator else: - if config.model.model_conf.asr_weight > 0.: - Collator = TripletKaldiPrePorocessedCollator - TestCollator = KaldiPrePorocessedCollator - else: - TestCollator = Collator = KaldiPrePorocessedCollator + TestCollator = Collator = SpeechCollator collate_fn_train = Collator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = Collator.from_config(config) diff --git a/deepspeech/frontend/audio.py b/deepspeech/frontend/audio.py index ffdcd4b3..13dc3a44 100644 --- a/deepspeech/frontend/audio.py +++ b/deepspeech/frontend/audio.py @@ -24,8 +24,10 @@ import soundfile import soxbindings as sox from scipy import signal +from .utility import subfile_from_tar -class AudioSegment(object): + +class AudioSegment(): """Monaural audio segment abstraction. :param samples: Audio samples [num_samples x num_channels]. @@ -68,16 +70,20 @@ class AudioSegment(object): self.duration, self.rms_db)) @classmethod - def from_file(cls, file): + def from_file(cls, file, infos=None): """Create audio segment from audio file. - - :param filepath: Filepath or file object to audio file. - :type filepath: str|file - :return: Audio segment instance. - :rtype: AudioSegment + + Args: + filepath (str|file): Filepath or file object to audio file. + infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None. 
+ + Returns: + AudioSegment: Audio segment instance. """ if isinstance(file, str) and re.findall(r".seqbin_\d+$", file): return cls.from_sequence_file(file) + elif isinstance(file, str) and file.startswith('tar:'): + return cls.from_file(subfile_from_tar(file, infos)) else: samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index 5082850d..f9f7d7c2 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -64,8 +64,12 @@ class SpeechFeaturizer(): target_sample_rate=16000, use_dB_normalization=True, target_dB=-20, - dither=1.0): - self._audio_featurizer = AudioFeaturizer( + dither=1.0, + maskctc=False): + self.stride_ms = stride_ms + self.window_ms = window_ms + + self.audio_feature = AudioFeaturizer( specgram_type=specgram_type, feat_dim=feat_dim, delta_delta=delta_delta, @@ -77,8 +81,12 @@ class SpeechFeaturizer(): use_dB_normalization=use_dB_normalization, target_dB=target_dB, dither=dither) - self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath, - spm_model_prefix) + + self.text_feature = TextFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + maskctc=maskctc) def featurize(self, speech_segment, keep_transcription_text): """Extract features for speech segment. @@ -94,60 +102,33 @@ class SpeechFeaturizer(): Returns: tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices. """ - spec_feature = self._audio_featurizer.featurize(speech_segment) + spec_feature = self.audio_feature.featurize(speech_segment) + if keep_transcription_text: return spec_feature, speech_segment.transcript + if speech_segment.has_token: text_ids = speech_segment.token_ids else: - text_ids = self._text_featurizer.featurize( - speech_segment.transcript) + text_ids = self.text_feature.featurize(speech_segment.transcript) return spec_feature, text_ids - @property - def vocab_size(self): - """Return the vocabulary size. - Returns: - int: Vocabulary size. - """ - return self._text_featurizer.vocab_size - - @property - def vocab_list(self): - """Return the vocabulary in list. - Returns: - List[str]: - """ - return self._text_featurizer.vocab_list + def text_featurize(self, text, keep_transcription_text): + """Extract features for speech segment. - @property - def vocab_dict(self): - """Return the vocabulary in dict. - Returns: - Dict[str, int]: - """ - return self._text_featurizer.vocab_dict + 1. For audio parts, extract the audio features. + 2. For transcript parts, keep the original text or convert text string + to a list of token indices in char-level. - @property - def feature_size(self): - """Return the audio feature size. - Returns: - int: audio feature size. - """ - return self._audio_featurizer.feature_size + Args: + text (str): text. + keep_transcription_text (bool): True, keep transcript text, False, token ids - @property - def stride_ms(self): - """time length in `ms` unit per frame Returns: - float: time(ms)/frame + (str|List[int]): text, or list of token indices. """ - return self._audio_featurizer.stride_ms + if keep_transcription_text: + return text - @property - def text_feature(self): - """Return the text feature object. - Returns: - TextFeaturizer: object. 
- """ - return self._text_featurizer + text_ids = self.text_feature.featurize(text) + return text_ids diff --git a/deepspeech/frontend/speech.py b/deepspeech/frontend/speech.py index e58795c0..9eed9725 100644 --- a/deepspeech/frontend/speech.py +++ b/deepspeech/frontend/speech.py @@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment): return not self.__eq__(other) @classmethod - def from_file(cls, filepath, transcript, tokens=None, token_ids=None): + def from_file(cls, + filepath, + transcript, + tokens=None, + token_ids=None, + infos=None): """Create speech segment from audio file and corresponding transcript. Args: @@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment): transcript (str): Transcript text for the speech. tokens (List[str], optional): text tokens. Defaults to None. token_ids (List[int], optional): text token ids. Defaults to None. + infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None. Returns: SpeechSegment: Speech segment instance. """ - - audio = AudioSegment.from_file(filepath) + audio = AudioSegment.from_file(filepath, infos) return cls(audio.samples, audio.sample_rate, transcript, tokens, token_ids) diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 3a972b50..2a581232 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -14,6 +14,7 @@ """Contains data helper functions.""" import json import math +import tarfile from typing import List from typing import Optional from typing import Text @@ -112,6 +113,51 @@ def read_manifest( return manifest +# Tar File read +TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) + + +def parse_tar(file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + +def subfile_from_tar(file, local_data=None): + """Get subfile object from tar. + + tar:tarpath#filename + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + + if local_data is None: + local_data = TarLocalData(tar2info={}, tar2object={}) + + assert isinstance(local_data, TarLocalData) + + if 'tar2info' not in local_data.__dict__: + local_data.tar2info = {} + if 'tar2object' not in local_data.__dict__: + local_data.tar2object = {} + + if tarpath not in local_data.tar2info: + fobj, infos = parse_tar(tarpath) + local_data.tar2info[tarpath] = infos + local_data.tar2object[tarpath] = fobj + else: + fobj = local_data.tar2object[tarpath] + infos = local_data.tar2info[tarpath] + return fobj.extractfile(infos[filename]) + + def rms_to_db(rms: float): """Root Mean Square to dB. diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 15b89ab9..c5c0a414 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import io -from collections import namedtuple from typing import Optional import numpy as np @@ -23,96 +22,17 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment from deepspeech.frontend.utility import IGNORE_ID +from deepspeech.frontend.utility import TarLocalData +from deepspeech.io.reader import LoadInputsAndTargets from deepspeech.io.utility import pad_list from deepspeech.utils.log import Log -__all__ = ["SpeechCollator"] +__all__ = ["SpeechCollator", "TripletSpeechCollator"] logger = Log(__name__).getlog() -# namedtupe need global for pickle. -TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - - -class SpeechCollator(): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - - @classmethod - def from_config(cls, config): - """Build a SpeechCollator object from a config. - - Args: - config (yacs.config.CfgNode): configs object. - - Returns: - SpeechCollator: collator object. - """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'specgram_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: - aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.collator.augmentation_config - assert isinstance(aug_file, io.StringIO) - - speech_collator = cls( - aug_file=aug_file, - random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - specgram_type=config.collator.specgram_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) - return speech_collator +class SpeechCollatorBase(): def __init__( self, aug_file, @@ -121,7 +41,7 @@ class SpeechCollator(): spm_model_prefix, random_seed=0, unit_type="char", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' + spectrum_type='linear', # 'linear', 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank' delta_delta=False, # 'mfcc', 'fbank' 
stride_ms=10.0, # ms @@ -146,7 +66,7 @@ class SpeechCollator(): n_fft (int, optional): fft points for rfft. Defaults to None. max_freq (int, optional): max cut freq. Defaults to None. target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. use_dB_normalization (bool, optional): do dB normalization. Defaults to True. @@ -159,23 +79,27 @@ class SpeechCollator(): Padding audio features with zeros to make them have the same shape (or a user-defined shape) within one batch. """ - self._keep_transcription_text = keep_transcription_text + self.keep_transcription_text = keep_transcription_text + self.stride_ms = stride_ms + self.window_ms = window_ms + self.feat_dim = feat_dim + + self.loader = LoadInputsAndTargets() + # only for tar filetype self._local_data = TarLocalData(tar2info={}, tar2object={}) - self._augmentation_pipeline = AugmentationPipeline( + + self.augmentation = AugmentationPipeline( augmentation_config=aug_file.read(), random_seed=random_seed) self._normalizer = FeatureNormalizer( mean_std_filepath) if mean_std_filepath else None - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - self._speech_featurizer = SpeechFeaturizer( unit_type=unit_type, vocab_filepath=vocab_filepath, spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, + spectrum_type=spectrum_type, feat_dim=feat_dim, delta_delta=delta_delta, stride_ms=stride_ms, @@ -187,33 +111,11 @@ class SpeechCollator(): target_dB=target_dB, dither=dither) - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - - It will return a subfile object from tar file - and cached tar file info for next reading request. - """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) + self.feature_size = self._speech_featurizer.audio_feature.feature_size + self.text_feature = self._speech_featurizer.text_feature + self.vocab_dict = self.text_feature.vocab_dict + self.vocab_list = self.text_feature.vocab_list + self.vocab_size = self.text_feature.vocab_size def process_utterance(self, audio_file, transcript): """Load, augment, featurize and normalize for speech data. @@ -226,23 +128,36 @@ class SpeechCollator(): where transcription part could be token ids or text. 
:rtype: tuple of (2darray, list) """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), transcript) + filetype = self.loader.file_type(audio_file) + + if filetype != 'sound': + spectrum = self.loader._get_from_loader(audio_file, filetype) + feat_dim = spectrum.shape[1] + assert feat_dim == self.feat_dim, f"expect feat dim {self.feat_dim}, but got {feat_dim}" + + if self.keep_transcription_text: + transcript_part = transcript + else: + text_ids = self.text_feature.featurize(transcript) + transcript_part = text_ids else: - speech_segment = SpeechSegment.from_file(audio_file, transcript) + # read audio + speech_segment = SpeechSegment.from_file( + audio_file, transcript, infos=self._local_data) + # audio augment + self.augmentation.transform_audio(speech_segment) - # audio augment - self._augmentation_pipeline.transform_audio(speech_segment) + # extract speech feature + spectrum, transcript_part = self._speech_featurizer.featurize( + speech_segment, self.keep_transcription_text) - specgram, transcript_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) + # CMVN spectrum + if self._normalizer: + spectrum = self._normalizer.apply(spectrum) - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - return specgram, transcript_part + # spectrum augment + spectrum = self.augmentation.transform_feature(spectrum) + return spectrum, transcript_part def __call__(self, batch): """batch examples @@ -272,16 +187,14 @@ class SpeechCollator(): audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) # text - # for training, text is token ids - # else text is string, convert to unicode ord + # for training, text is token ids, else text is string, convert to unicode ord tokens = [] - if self._keep_transcription_text: + if self.keep_transcription_text: assert isinstance(text, str), (type(text), text) tokens = [ord(t) for t in text] else: tokens = text # token ids - tokens = tokens if isinstance(tokens, np.ndarray) else np.array( - tokens, dtype=np.int64) + tokens = np.array(tokens, dtype=np.int64) texts.append(tokens) text_lens.append(tokens.shape[0]) @@ -292,26 +205,162 @@ class SpeechCollator(): olens = np.array(text_lens).astype(np.int64) return utts, xs_pad, ilens, ys_pad, olens - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - @property - def vocab_list(self): - return self._speech_featurizer.vocab_list +class SpeechCollator(SpeechCollatorBase): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + spectrum_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=False)) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. 
+ + Returns: + SpeechCollator: collator object. + """ + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.collator + assert 'spectrum_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator + + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: + aug_file = io.open( + config.collator.augmentation_config, + mode='r', + encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.collator.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + spectrum_type=config.collator.spectrum_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text) + return speech_collator + + +class TripletSpeechCollator(SpeechCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. - @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + spectrum, translation_part = super().process_utterance(audio_file, + translation) + transcript_part = self._speech_featurizer.text_featurize( + transcript, self.keep_transcription_text) + return spectrum, translation_part, transcript_part - @property - def text_feature(self): - return self._speech_featurizer.text_feature + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) - @property - def feature_size(self): - return self._speech_featurizer.feature_size + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. 
+ audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] - @property - def stride_ms(self): - return self._speech_featurizer.stride_ms + utts = [] + for utt, audio, translation, transcription in batch: + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + if self.keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens[idx] = [ord(t) for t in text] + else: + tokens[idx] = text # token ids + tokens[idx] = np.array(tokens[idx], dtype=np.int64) + + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_translation = pad_sequence( + translation_text, padding_value=IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + padded_transcription = pad_sequence( + transcription_text, padding_value=IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, ( + padded_translation, padded_transcription), (translation_lens, + transcription_lens) diff --git a/deepspeech/io/collator_st.py b/deepspeech/io/collator_st.py deleted file mode 100644 index 28573366..00000000 --- a/deepspeech/io/collator_st.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import io -from collections import namedtuple -from typing import Optional - -import kaldiio -import numpy as np -from yacs.config import CfgNode - -from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline -from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer -from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer -from deepspeech.frontend.normalizer import FeatureNormalizer -from deepspeech.frontend.speech import SpeechSegment -from deepspeech.frontend.utility import IGNORE_ID -from deepspeech.io.utility import pad_sequence -from deepspeech.utils.log import Log - -__all__ = ["SpeechCollator", "KaldiPrePorocessedCollator"] - -logger = Log(__name__).getlog() - -# namedtupe need global for pickle. 
-TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) - - -class SpeechCollator(): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - - @classmethod - def from_config(cls, config): - """Build a SpeechCollator object from a config. - - Args: - config (yacs.config.CfgNode): configs object. - - Returns: - SpeechCollator: collator object. - """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'specgram_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: - aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.collator.augmentation_config - assert isinstance(aug_file, io.StringIO) - - speech_collator = cls( - aug_file=aug_file, - random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - specgram_type=config.collator.specgram_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) - return speech_collator - - def __init__( - self, - aug_file, - mean_std_filepath, - vocab_filepath, - spm_model_prefix, - random_seed=0, - unit_type="char", - specgram_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, - keep_transcription_text=True): - """SpeechCollator Collator - - Args: - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - mean_std_filepath (str): mean and std file path, which suffix is *.npy - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. - stride_ms (float, optional): stride size in ms. Defaults to 10.0. - window_ms (float, optional): window size in ms. Defaults to 20.0. 
- n_fft (int, optional): fft points for rfft. Defaults to None. - max_freq (int, optional): max cut freq. Defaults to None. - target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. - feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. - delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. - use_dB_normalization (bool, optional): do dB normalization. Defaults to True. - target_dB (int, optional): target dB. Defaults to -20. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. - if ``keep_transcription_text`` is False, text is token ids else is raw string. - - Do augmentations - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one batch. - """ - self._keep_transcription_text = keep_transcription_text - - self._local_data = TarLocalData(tar2info={}, tar2object={}) - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=aug_file.read(), random_seed=random_seed) - - self._normalizer = FeatureNormalizer( - mean_std_filepath) if mean_std_filepath else None - - self._stride_ms = stride_ms - self._target_sample_rate = target_sample_rate - - self._speech_featurizer = SpeechFeaturizer( - unit_type=unit_type, - vocab_filepath=vocab_filepath, - spm_model_prefix=spm_model_prefix, - specgram_type=specgram_type, - feat_dim=feat_dim, - delta_delta=delta_delta, - stride_ms=stride_ms, - window_ms=window_ms, - n_fft=n_fft, - max_freq=max_freq, - target_sample_rate=target_sample_rate, - use_dB_normalization=use_dB_normalization, - target_dB=target_dB, - dither=dither) - - def _parse_tar(self, file): - """Parse a tar file to get a tarfile object - and a map containing tarinfoes - """ - result = {} - f = tarfile.open(file) - for tarinfo in f.getmembers(): - result[tarinfo.name] = tarinfo - return f, result - - def _subfile_from_tar(self, file): - """Get subfile object from tar. - - It will return a subfile object from tar file - and cached tar file info for next reading request. - """ - tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self._local_data.__dict__: - self._local_data.tar2info = {} - if 'tar2object' not in self._local_data.__dict__: - self._local_data.tar2object = {} - if tarpath not in self._local_data.tar2info: - object, infoes = self._parse_tar(tarpath) - self._local_data.tar2info[tarpath] = infoes - self._local_data.tar2object[tarpath] = object - return self._local_data.tar2object[tarpath].extractfile( - self._local_data.tar2info[tarpath][filename]) - - @property - def manifest(self): - return self._manifest - - @property - def vocab_size(self): - return self._speech_featurizer.vocab_size - - @property - def vocab_list(self): - return self._speech_featurizer.vocab_list - - @property - def vocab_dict(self): - return self._speech_featurizer.vocab_dict - - @property - def text_feature(self): - return self._speech_featurizer.text_feature - - @property - def feature_size(self): - return self._speech_featurizer.feature_size - - @property - def stride_ms(self): - return self._speech_featurizer.stride_ms - - def process_utterance(self, audio_file, translation): - """Load, augment, featurize and normalize for speech data. 
- - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param translation: translation text. - :type translation: str - :return: Tuple of audio feature tensor and data of translation part, - where translation part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), translation) - else: - speech_segment = SpeechSegment.from_file(audio_file, translation) - - # audio augment - self._augmentation_pipeline.transform_audio(speech_segment) - - specgram, translation_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - return specgram, translation_part - - def __call__(self, batch): - """batch examples - - Args: - batch ([List]): batch is (audio, text) - audio (np.ndarray) shape (T, D) - text (List[int] or str): shape (U,) - - Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) - """ - audios = [] - audio_lens = [] - texts = [] - text_lens = [] - utts = [] - for utt, audio, text in batch: - audio, text = self.process_utterance(audio, text) - #utt - utts.append(utt) - # audio - audios.append(audio) # [T, D] - audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord - tokens = [] - if self._keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens = [ord(t) for t in text] - else: - tokens = text # token ids - tokens = tokens if isinstance(tokens, np.ndarray) else np.array( - tokens, dtype=np.int64) - texts.append(tokens) - text_lens.append(tokens.shape[0]) - - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_texts = pad_sequence( - texts, padding_value=IGNORE_ID).astype(np.int64) - text_lens = np.array(text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, padded_texts, text_lens - - -class TripletSpeechCollator(SpeechCollator): - def process_utterance(self, audio_file, translation, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of audio file. - :type audio_file: str | file - :param translation: translation text. - :type translation: str - :return: Tuple of audio feature tensor and data of translation part, - where translation part could be token ids or text. 
- :rtype: tuple of (2darray, list) - """ - if isinstance(audio_file, str) and audio_file.startswith('tar:'): - speech_segment = SpeechSegment.from_file( - self._subfile_from_tar(audio_file), translation) - else: - speech_segment = SpeechSegment.from_file(audio_file, translation) - - # audio augment - self._augmentation_pipeline.transform_audio(speech_segment) - - specgram, translation_part = self._speech_featurizer.featurize( - speech_segment, self._keep_transcription_text) - transcript_part = self._speech_featurizer._text_featurizer.featurize( - transcript) - if self._normalizer: - specgram = self._normalizer.apply(specgram) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - return specgram, translation_part, transcript_part - - def __call__(self, batch): - """batch examples - - Args: - batch ([List]): batch is (audio, text) - audio (np.ndarray) shape (T, D) - text (List[int] or str): shape (U,) - - Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) - """ - audios = [] - audio_lens = [] - translation_text = [] - translation_text_lens = [] - transcription_text = [] - transcription_text_lens = [] - - utts = [] - for utt, audio, translation, transcription in batch: - audio, translation, transcription = self.process_utterance( - audio, translation, transcription) - #utt - utts.append(utt) - # audio - audios.append(audio) # [T, D] - audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord - tokens = [[], []] - for idx, text in enumerate([translation, transcription]): - if self._keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = tokens[idx] if isinstance( - tokens[idx], np.ndarray) else np.array( - tokens[idx], dtype=np.int64) - translation_text.append(tokens[0]) - translation_text_lens.append(tokens[0].shape[0]) - transcription_text.append(tokens[1]) - transcription_text_lens.append(tokens[1].shape[0]) - - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) - translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) - transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) - - -class KaldiPrePorocessedCollator(SpeechCollator): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - feat_dim=0, - stride_ms=10.0, - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - - @classmethod - def from_config(cls, config): - """Build a SpeechCollator object from a config. - - Args: - config (yacs.config.CfgNode): configs object. - - Returns: - SpeechCollator: collator object. 
- """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'vocab_filepath' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: - aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.collator.augmentation_config - assert isinstance(aug_file, io.StringIO) - - speech_collator = cls( - aug_file=aug_file, - random_seed=0, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - feat_dim=config.collator.feat_dim, - stride_ms=config.collator.stride_ms, - keep_transcription_text=config.collator.keep_transcription_text) - return speech_collator - - def __init__(self, - aug_file, - vocab_filepath, - spm_model_prefix, - random_seed=0, - unit_type="char", - feat_dim=0, - stride_ms=10.0, - keep_transcription_text=True): - """SpeechCollator Collator - - Args: - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. - if ``keep_transcription_text`` is False, text is token ids else is raw string. - - Do augmentations - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one batch. - """ - self._keep_transcription_text = keep_transcription_text - self._feat_dim = feat_dim - self._stride_ms = stride_ms - - self._local_data = TarLocalData(tar2info={}, tar2object={}) - self._augmentation_pipeline = AugmentationPipeline( - augmentation_config=aug_file.read(), random_seed=random_seed) - - self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath, - spm_model_prefix) - - def process_utterance(self, audio_file, translation): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of kaldi processed feature. - :type audio_file: str | file - :param translation: Translation text. - :type translation: str - :return: Tuple of audio feature tensor and data of translation part, - where translation part could be token ids or text. - :rtype: tuple of (2darray, list) - """ - specgram = kaldiio.load_mat(audio_file) - assert specgram.shape[ - 1] == self._feat_dim, 'expect feat dim {}, but got {}'.format( - self._feat_dim, specgram.shape[1]) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - - if self._keep_transcription_text: - return specgram, translation - else: - text_ids = self._text_featurizer.featurize(translation) - return specgram, text_ids - - -class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator): - def process_utterance(self, audio_file, translation, transcript): - """Load, augment, featurize and normalize for speech data. - - :param audio_file: Filepath or file object of kali processed feature. - :type audio_file: str | file - :param translation: Translation text. - :type translation: str - :param transcript: Transcription text. 
- :type transcript: str - :return: Tuple of audio feature tensor and data of translation and transcription parts, - where translation and transcription parts could be token ids or text. - :rtype: tuple of (2darray, (list, list)) - """ - specgram = kaldiio.load_mat(audio_file) - assert specgram.shape[ - 1] == self._feat_dim, 'expect feat dim {}, but got {}'.format( - self._feat_dim, specgram.shape[1]) - - # specgram augment - specgram = self._augmentation_pipeline.transform_feature(specgram) - - if self._keep_transcription_text: - return specgram, translation, transcript - else: - translation_text_ids = self._text_featurizer.featurize(translation) - transcript_text_ids = self._text_featurizer.featurize(transcript) - return specgram, translation_text_ids, transcript_text_ids - - def __call__(self, batch): - """batch examples - - Args: - batch ([List]): batch is (audio, text) - audio (np.ndarray) shape (T, D) - translation (List[int] or str): shape (U,) - transcription (List[int] or str): shape (V,) - - Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - translation_text : (B, Umax) - translation_text_lens: (B) - transcription_text : (B, Vmax) - transcription_text_lens: (B) - """ - audios = [] - audio_lens = [] - translation_text = [] - translation_text_lens = [] - transcription_text = [] - transcription_text_lens = [] - - utts = [] - for utt, audio, translation, transcription in batch: - audio, translation, transcription = self.process_utterance( - audio, translation, transcription) - #utt - utts.append(utt) - # audio - audios.append(audio) # [T, D] - audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord - tokens = [[], []] - for idx, text in enumerate([translation, transcription]): - if self._keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = tokens[idx] if isinstance( - tokens[idx], np.ndarray) else np.array( - tokens[idx], dtype=np.int64) - translation_text.append(tokens[0]) - translation_text_lens.append(tokens[0].shape[0]) - transcription_text.append(tokens[1]) - transcription_text_lens.append(tokens[1].shape[0]) - - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) - translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) - transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index 95cdbb95..30ae98f0 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -321,6 +321,22 @@ class LoadInputsAndTargets(): raise NotImplementedError( "Not supported: loader_type={}".format(filetype)) + def file_type(self, filepath): + suffix = filepath.split(":")[0].split('.')[1] + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support 
filetype: {suffix}") + class SoundHDF5File(): """Collecting sound files to a HDF5 file From 35cbbc8a389b32408d75ef6637c232cfd36841a3 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 28 Sep 2021 10:29:49 +0000 Subject: [PATCH 06/11] add requirements for hub --- hub/requirements.txt | 26 +++++++++++++++++ hub/setup_hub.sh | 66 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 hub/requirements.txt create mode 100644 hub/setup_hub.sh diff --git a/hub/requirements.txt b/hub/requirements.txt new file mode 100644 index 00000000..c4c7d022 --- /dev/null +++ b/hub/requirements.txt @@ -0,0 +1,26 @@ +coverage +gpustat +jsonlines +kaldiio +llvmlite==0.31.0 +loguru +numba==0.47.0 +numpy==1.18.5 +Pillow +pre-commit +pybind11 +python-speech-features +resampy==0.2.2 +sacrebleu +scipy==1.2.1 +sentencepiece +snakeviz +SoundFile==0.9.0.post1 +sox +soxbindings +tensorboardX +textgrid +tqdm +typeguard +visualdl==2.2.0 +yacs diff --git a/hub/setup_hub.sh b/hub/setup_hub.sh new file mode 100644 index 00000000..f2d43ad1 --- /dev/null +++ b/hub/setup_hub.sh @@ -0,0 +1,66 @@ +#! /usr/bin/env bash +cd .. >> /dev/null +source utils/log.sh + + +SUDO='sudo' +if [ $(id -u) -eq 0 ]; then + SUDO='' +fi + +if [ -e /etc/lsb-release ];then + ${SUDO} apt-get update -y + ${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev + if [ $? != 0 ]; then + error_msg "Please using Ubuntu or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev by user." + exit -1 + fi +fi + + +source tools/venv/bin/activate + +cd - +#install python dependencies +if [ -f "requirements.txt" ]; then + pip3 install -r requirements.txt +fi +if [ $? != 0 ]; then + error_msg "Install python dependencies failed !!!" + exit 1 +fi +cd .. >> /dev/null + +# install package libsndfile +python3 -c "import soundfile" +if [ $? != 0 ]; then + info_msg "Install package libsndfile into default system path." + wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz" + if [ $? != 0 ]; then + error_msg "Download libsndfile-1.0.28.tar.gz failed !!!" + exit 1 + fi + tar -zxvf libsndfile-1.0.28.tar.gz + cd libsndfile-1.0.28 + ./configure > /dev/null && make > /dev/null && make install > /dev/null + cd .. + rm -rf libsndfile-1.0.28 + rm libsndfile-1.0.28.tar.gz +fi + + +# install decoders +python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" +if [ $? != 0 ]; then + cd deepspeech/decoders/swig > /dev/null + sh setup.sh + cd - > /dev/null +fi +python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")" +if [ $? != 0 ]; then + error_msg "Please check why decoder install error!" + exit -1 +fi + + +info_msg "Install all dependencies successfully." 
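
The hub setup above pins its Python dependencies in hub/requirements.txt and relies on shell checks (for libsndfile and the swig_decoders==1.1 package) to verify the environment. As a rough illustration of the same idea in Python — not part of the patch; the requirement strings below are only a sample copied from the pinned list — a small pre-flight check could look like this:

```python
# Hypothetical pre-flight check mirroring the pkg_resources checks in hub/setup_hub.sh.
# The requirement strings are a sample of hub/requirements.txt; adjust as needed.
import pkg_resources

PINNED = [
    "numpy==1.18.5",
    "resampy==0.2.2",
    "SoundFile==0.9.0.post1",
    "visualdl==2.2.0",
    "swig_decoders==1.1",  # installed separately by deepspeech/decoders/swig/setup.sh
]

def check_requirements(requirements):
    """Return the requirement strings that are missing or at the wrong version."""
    problems = []
    for req in requirements:
        try:
            pkg_resources.require(req)
        except Exception as exc:  # DistributionNotFound, VersionConflict, ...
            problems.append(f"{req}: {exc}")
    return problems

if __name__ == "__main__":
    missing = check_requirements(PINNED)
    if missing:
        print("Install/upgrade the following before running the hub demo:")
        print("\n".join(missing))
    else:
        print("All pinned dependencies satisfied.")
```
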
From b7b1bda34f920ef457486fd7a494464c31540a4a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:47:53 +0000 Subject: [PATCH 07/11] test refactor collator --- deepspeech/exps/u2_st/model.py | 6 +- .../frontend/featurizer/audio_featurizer.py | 32 +++--- .../frontend/featurizer/speech_featurizer.py | 36 +----- deepspeech/frontend/utility.py | 1 + deepspeech/io/collator.py | 108 ++++++++++-------- deepspeech/io/dataset.py | 21 +--- deepspeech/io/reader.py | 2 +- docs/src/data_preparation.md | 2 +- docs/src/deepspeech_architecture.md | 2 +- examples/1xt2x/aishell/conf/deepspeech2.yaml | 2 +- .../1xt2x/baidu_en8k/conf/deepspeech2.yaml | 2 +- .../1xt2x/librispeech/conf/deepspeech2.yaml | 2 +- examples/aishell/s0/conf/deepspeech2.yaml | 2 +- .../aishell/s0/conf/deepspeech2_online.yaml | 2 +- examples/aishell/s0/local/data.sh | 2 +- examples/aishell/s1/conf/chunk_conformer.yaml | 2 +- examples/aishell/s1/conf/conformer.yaml | 2 +- examples/aishell/s1/local/data.sh | 2 +- .../callcenter/s1/conf/chunk_conformer.yaml | 2 +- examples/callcenter/s1/conf/conformer.yaml | 2 +- examples/callcenter/s1/local/data.sh | 2 +- examples/librispeech/s0/conf/deepspeech2.yaml | 2 +- .../s0/conf/deepspeech2_online.yaml | 2 +- examples/librispeech/s0/local/data.sh | 2 +- .../librispeech/s1/conf/chunk_conformer.yaml | 2 +- .../s1/conf/chunk_transformer.yaml | 2 +- examples/librispeech/s1/conf/conformer.yaml | 2 +- examples/librispeech/s1/conf/transformer.yaml | 2 +- examples/librispeech/s1/local/data.sh | 2 +- .../librispeech/s2/conf/chunk_conformer.yaml | 2 +- .../s2/conf/chunk_transformer.yaml | 2 +- examples/librispeech/s2/conf/conformer.yaml | 2 +- examples/librispeech/s2/local/data.sh | 2 +- examples/ted_en_zh/t0/conf/transformer.yaml | 2 +- .../t0/conf/transformer_joint_noam.yaml | 2 +- examples/ted_en_zh/t0/local/data.sh | 2 +- examples/timit/s1/conf/transformer.yaml | 2 +- examples/timit/s1/local/data.sh | 2 +- examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s0/conf/deepspeech2_online.yaml | 2 +- examples/tiny/s0/local/data.sh | 2 +- examples/tiny/s1/conf/chunk_confermer.yaml | 2 +- examples/tiny/s1/conf/chunk_transformer.yaml | 2 +- examples/tiny/s1/conf/conformer.yaml | 2 +- examples/tiny/s1/conf/transformer.yaml | 2 +- examples/tiny/s1/local/data.sh | 2 +- utils/compute_mean_std.py | 4 +- 47 files changed, 125 insertions(+), 163 deletions(-) diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index f5a514c7..9a34cbdc 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -31,7 +31,6 @@ from yacs.config import CfgNode from deepspeech.io.collator import SpeechCollator from deepspeech.io.collator import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset -from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2_st import U2STModel @@ -249,12 +248,11 @@ class U2STTrainer(Trainer): config.collator.keep_transcription_text = False # train/valid dataset, return token ids - Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. 
else ManifestDataset config.data.manifest = config.data.train_manifest - train_dataset = Dataset.from_config(config) + train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - dev_dataset = Dataset.from_config(config) + dev_dataset = ManifestDataset.from_config(config) if config.model.model_conf.asr_weight > 0.: Collator = TripletSpeechCollator diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index 4c40c847..6f3b646c 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -24,15 +24,15 @@ class AudioFeaturizer(): Currently, it supports feature types of linear spectrogram and mfcc. - :param specgram_type: Specgram feature type. Options: 'linear'. - :type specgram_type: str + :param spectrum_type: Specgram feature type. Options: 'linear'. + :type spectrum_type: str :param stride_ms: Striding size (in milliseconds) for generating frames. :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins + :param max_freq: When spectrum_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_feq is the + returned; when spectrum_type is 'mfcc', max_feq is the highest band edge of mel filters. :types max_freq: None|float :param target_sample_rate: Audio are resampled (if upsampling or @@ -47,7 +47,7 @@ class AudioFeaturizer(): """ def __init__(self, - specgram_type: str='linear', + spectrum_type: str='linear', feat_dim: int=None, delta_delta: bool=False, stride_ms=10.0, @@ -58,7 +58,7 @@ class AudioFeaturizer(): use_dB_normalization=True, target_dB=-20, dither=1.0): - self._specgram_type = specgram_type + self._spectrum_type = spectrum_type # mfcc and fbank using `feat_dim` self._feat_dim = feat_dim # mfcc and fbank using `delta-delta` @@ -113,27 +113,27 @@ class AudioFeaturizer(): def feature_size(self): """audio feature size""" feat_dim = 0 - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': fft_point = self._window_ms if self._fft_point is None else self._fft_point feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': # mfcc, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': # fbank, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." 
% self._spectrum_type) return feat_dim def _compute_specgram(self, audio_segment): """Extract various audio features.""" sample_rate = audio_segment.sample_rate - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': samples = audio_segment.samples return self._compute_linear_specgram( samples, @@ -141,7 +141,7 @@ class AudioFeaturizer(): stride_ms=self._stride_ms, window_ms=self._window_ms, max_freq=self._max_freq) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': samples = audio_segment.to('int16') return self._compute_mfcc( samples, @@ -152,7 +152,7 @@ class AudioFeaturizer(): max_freq=self._max_freq, dither=self._dither, delta_delta=self._delta_delta) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': samples = audio_segment.to('int16') return self._compute_fbank( samples, @@ -164,8 +164,8 @@ class AudioFeaturizer(): dither=self._dither, delta_delta=self._delta_delta) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) def _specgram_real(self, samples, window_size, stride_size, sample_rate): """Compute the spectrogram for samples from a real signal.""" diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index f9f7d7c2..7471d164 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(): - """Speech featurizer, for extracting features from both audio and transcript - contents of SpeechSegment. - - Currently, for audio parts, it supports feature types of linear - spectrogram and mfcc; for transcript parts, it only supports char-level - tokenizing and conversion into a list of token indices. Note that the - token indexing order follows the given vocabulary file. - - :param vocab_filepath: Filepath to load vocabulary for token indices - conversion. - :type specgram_type: str - :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. - :type specgram_type: str - :param stride_ms: Striding size (in milliseconds) for generating frames. - :type stride_ms: float - :param window_ms: Window size (in milliseconds) for generating frames. - :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins - corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_freq is the - highest band edge of mel filters. - :types max_freq: None|float - :param target_sample_rate: Speech are resampled (if upsampling or - downsampling is allowed) to this before - extracting spectrogram features. - :type target_sample_rate: float - :param use_dB_normalization: Whether to normalize the audio to a certain - decibels before extracting the features. - :type use_dB_normalization: bool - :param target_dB: Target audio decibels for normalization. - :type target_dB: float + """Speech and Text feature extraction. 
""" def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None, - specgram_type='linear', + spectrum_type='linear', feat_dim=None, delta_delta=False, stride_ms=10.0, @@ -70,7 +40,7 @@ class SpeechFeaturizer(): self.window_ms = window_ms self.audio_feature = AudioFeaturizer( - specgram_type=specgram_type, + spectrum_type=spectrum_type, feat_dim=feat_dim, delta_delta=delta_delta, stride_ms=stride_ms, diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 2a581232..f5fc3097 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -15,6 +15,7 @@ import json import math import tarfile +from collections import namedtuple from typing import List from typing import Optional from typing import Text diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index c5c0a414..553ffcb5 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"] logger = Log(__name__).getlog() +def tokenids(text, keep_transcription_text): + # for training text is token ids + tokens = text # token ids + + if keep_transcription_text: + # text is string, convert to unicode ord + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + + tokens = np.array(tokens, dtype=np.int64) + return tokens + + class SpeechCollatorBase(): def __init__( self, @@ -150,7 +163,6 @@ class SpeechCollatorBase(): # extract speech feature spectrum, transcript_part = self._speech_featurizer.featurize( speech_segment, self.keep_transcription_text) - # CMVN spectrum if self._normalizer: spectrum = self._normalizer.apply(spectrum) @@ -163,38 +175,35 @@ class SpeechCollatorBase(): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. + utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : (B, Umax) + olens: (B,) """ audios = [] audio_lens = [] texts = [] text_lens = [] utts = [] - for utt, audio, text in batch: + + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + text = item['text'] audio, text = self.process_utterance(audio, text) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids, else text is string, convert to unicode ord - tokens = [] - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens = [ord(t) for t in text] - else: - tokens = text # token ids - tokens = np.array(tokens, dtype=np.int64) + + tokens = tokenids(text, self.keep_transcription_text) texts.append(tokens) text_lens.append(tokens.shape[0]) @@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. 
+ utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : [(B, Umax), (B, Umax)] + olens: [(B,), (B,)] """ + utts = [] audios = [] audio_lens = [] translation_text = [] @@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator): transcription_text = [] transcription_text_lens = [] - utts = [] - for utt, audio, translation, transcription in batch: + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + translation = item['text'] + transcription = item['text1'] audio, translation, transcription = self.process_utterance( audio, translation, transcription) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord + tokens = [[], []] for idx, text in enumerate([translation, transcription]): - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = np.array(tokens[idx], dtype=np.int64) + tokens[idx] = tokenids(text, self.keep_transcription_text) translation_text.append(tokens[0]) translation_text_lens.append(tokens[0].shape[0]) transcription_text.append(tokens[1]) transcription_text_lens.append(tokens[1].shape[0]) - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) + xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D] + ilens = np.array(audio_lens).astype(np.int64) + + padded_translation = pad_list(translation_text, + IGNORE_ID).astype(np.int64) translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) + + padded_transcription = pad_list(transcription_text, + IGNORE_ID).astype(np.int64) transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) + + ys_pad = (padded_translation, padded_transcription) + olens = (translation_lens, transcription_lens) + return utts, xs_pad, ilens, ys_pad, olens diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 56e53475..1945c5f7 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -19,7 +19,7 @@ from yacs.config import CfgNode from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log -__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"] +__all__ = ["ManifestDataset", "TransformDataset"] logger = Log(__name__).getlog() @@ -107,21 +107,7 @@ class ManifestDataset(Dataset): return len(self._manifest) def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"] - - -class TripletManifestDataset(ManifestDataset): - """ - For Joint Training of Speech Translation and ASR. - text: translation, - text1: transcript. 
- """ - - def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"], instance[ - "text1"] + return self._manifest[idx] class TransformDataset(Dataset): @@ -273,5 +259,4 @@ class AudioDataset(Dataset): return len(self.minibatch) def __getitem__(self, idx): - instance = self.minibatch[idx] - return instance["utt"], instance["feat"], instance["text"] + return self.minibatch[idx] diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index 30ae98f0..e7c43a78 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -322,7 +322,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[1] + suffix = filepath.split(":")[0].split('.')[-1] if suffix == 'ark': return 'mat' elif suffix == 'scp': diff --git a/docs/src/data_preparation.md b/docs/src/data_preparation.md index a3d1b3eb..34d2a835 100644 --- a/docs/src/data_preparation.md +++ b/docs/src/data_preparation.md @@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w ```bash python3 utils/compute_mean_std.py \ --num_samples 2000 \ ---specgram_type linear \ +--spectrum_type linear \ --manifest_path examples/librispeech/data/manifest.train \ --output_path examples/librispeech/data/mean_std.npz ``` diff --git a/docs/src/deepspeech_architecture.md b/docs/src/deepspeech_architecture.md index b9344122..5a6ca886 100644 --- a/docs/src/deepspeech_architecture.md +++ b/docs/src/deepspeech_architecture.md @@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute cd examples/aishell/s0 python3 ../../../utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/1xt2x/aishell/conf/deepspeech2.yaml b/examples/1xt2x/aishell/conf/deepspeech2.yaml index 6e745e9d..c2d69226 100644 --- a/examples/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/1xt2x/aishell/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml index fbc7466f..be51a9b9 100644 --- a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/1xt2x/librispeech/conf/deepspeech2.yaml index edef0797..ad7fb2c1 100644 --- a/examples/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/1xt2x/librispeech/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 9560930a..ffefaeb3 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: 
conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml index 7e87594c..cac599dc 100644 --- a/examples/aishell/s0/conf/deepspeech2_online.yaml +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear #linear, mfcc, fbank + spectrum_type: linear #linear, mfcc, fbank feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index b106f3f2..1312a12f 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 6f8ae135..9b563da2 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index a4248459..dfa9a4b0 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 8d5ac4d5..c05c3ea2 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml index f79b8eaa..a853658a 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml index 3b08cc7a..bd4f4578 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank 
+ spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index e2640ead..b2a495b4 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index 3f1a376f..47ef9421 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml index 180a6205..e2f91094 100644 --- a/examples/librispeech/s0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index b7180986..e3f7b325 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=2000 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 92db20f6..872b560b 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index e0bc3135..132a4f9d 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 78be249c..769ed5f5 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank 
feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index e4a06767..c9dc1413 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 4ad476d3..2b6af229 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml index 92db20f6..872b560b 100644 --- a/examples/librispeech/s2/conf/chunk_conformer.yaml +++ b/examples/librispeech/s2/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml index e0bc3135..132a4f9d 100644 --- a/examples/librispeech/s2/conf/chunk_transformer.yaml +++ b/examples/librispeech/s2/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml index 9a727413..bc87466e 100644 --- a/examples/librispeech/s2/conf/conformer.yaml +++ b/examples/librispeech/s2/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh index 4ad476d3..2b6af229 100755 --- a/examples/librispeech/s2/local/data.sh +++ b/examples/librispeech/s2/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml index 1aad86d2..8c03e328 100644 --- a/examples/ted_en_zh/t0/conf/transformer.yaml +++ b/examples/ted_en_zh/t0/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + 
spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml index 0144c40d..cbfae93e 100644 --- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 32cfd9d7..43911c34 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml index c3b51996..1ae9acd0 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/s1/conf/transformer.yaml @@ -17,7 +17,7 @@ collator: augmentation_config: "" batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index 1d16f454..f4be9048 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 40899655..a7940cb2 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml index 0098a226..7e30409f 100644 --- a/examples/tiny/s0/conf/deepspeech2_online.yaml +++ b/examples/tiny/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index 02fdb706..fabf2e40 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index be2e82f9..f3c7e1dd 
100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 93439a85..83005754 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 9bb67c44..628e3b77 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index fcbe1da4..27ffcae4 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index 2aea250b..b5dbd581 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index a468153d..0f63715a 100755 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('num_samples', int, 2000, "# of samples to for statistics.") -add_arg('specgram_type', str, +add_arg('spectrum_type', str, 'linear', "Audio feature type. 
Options: linear, mfcc, fbank.", choices=['linear', 'mfcc', 'fbank']) @@ -58,7 +58,7 @@ def main(): augmentation_pipeline = AugmentationPipeline('{}') audio_featurizer = AudioFeaturizer( - specgram_type=args.specgram_type, + spectrum_type=args.spectrum_type, feat_dim=args.feat_dim, delta_delta=args.delta_delta, stride_ms=args.stride_ms, From 856d641c9ce748766ae53c1939fc995dea6aec9a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:48:21 +0000 Subject: [PATCH 08/11] multi worker for dataloader --- deepspeech/exps/deepspeech2/model.py | 8 +++++--- deepspeech/exps/u2/model.py | 9 ++++++--- deepspeech/exps/u2_st/model.py | 9 ++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index b854a996..e84de615 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -235,16 +235,18 @@ class DeepSpeech2Trainer(Trainer): num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size / 4), + batch_size=int(config.collator.batch_size), shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers) self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_test) + collate_fn=collate_fn_test, + num_workers=config.collator.num_workers) logger.info("Setup train/valid/test Dataloader!") diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 1afd9b10..c30f324b 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -292,7 +292,8 @@ class U2Trainer(Trainer): batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers, ) # test dataset, return raw text config.data.manifest = config.data.test_manifest @@ -314,7 +315,8 @@ class U2Trainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator.from_config(config)) + collate_fn=SpeechCollator.from_config(config), + num_workers=config.collator.num_workers, ) # return text token id config.collator.keep_transcription_text = False self.align_loader = DataLoader( @@ -322,7 +324,8 @@ class U2Trainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=SpeechCollator.from_config(config)) + collate_fn=SpeechCollator.from_config(config), + num_workers=config.collator.num_workers, ) logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 9a34cbdc..c480499c 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -292,7 +292,8 @@ class U2STTrainer(Trainer): batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn_dev) + collate_fn=collate_fn_dev, + num_workers=config.collator.num_workers, ) # test dataset, return raw text config.data.manifest = config.data.test_manifest @@ -313,7 +314,8 @@ class U2STTrainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=TestCollator.from_config(config)) + collate_fn=TestCollator.from_config(config), + num_workers=config.collator.num_workers, ) # return text token id config.collator.keep_transcription_text = False 
self.align_loader = DataLoader( @@ -321,7 +323,8 @@ class U2STTrainer(Trainer): batch_size=config.decoding.batch_size, shuffle=False, drop_last=False, - collate_fn=TestCollator.from_config(config)) + collate_fn=TestCollator.from_config(config), + num_workers=config.collator.num_workers, ) logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): From 3e37cef8e18e7ae7ec65223d147e50d0d091b1d2 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:53:00 +0000 Subject: [PATCH 09/11] fix test.sh opts --- examples/aishell/s1/local/test.sh | 6 ++++-- examples/callcenter/s1/local/test.sh | 6 ++++-- examples/ted_en_zh/t0/local/test.sh | 3 ++- examples/timit/s1/local/test.sh | 6 ++++-- examples/tiny/s1/local/test.sh | 6 ++++-- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/s1/local/test.sh index c87412c9..47bd2f63 100755 --- a/examples/aishell/s1/local/test.sh +++ b/examples/aishell/s1/local/test.sh @@ -38,7 +38,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -56,7 +57,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/s1/local/test.sh index dca3137d..0aa99e19 100755 --- a/examples/callcenter/s1/local/test.sh +++ b/examples/callcenter/s1/local/test.sh @@ -32,7 +32,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,7 +51,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/t0/local/test.sh index 34475085..7235c6f9 100755 --- a/examples/ted_en_zh/t0/local/test.sh +++ b/examples/ted_en_zh/t0/local/test.sh @@ -19,7 +19,8 @@ for type in fullsentence; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
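
The test.sh edits in this commit split a single `--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}` argument into two separate `--opts` flags, one key/value pair each. The Python side of the CLI is not shown in the patch, so the snippet below is only a hedged sketch of how repeated `--opts KEY VALUE` pairs can be collected with argparse and merged into a yacs CfgNode; the repository's actual parser may differ.

```python
# Illustrative only: a parser accepting repeated "--opts KEY VALUE" pairs,
# as the updated test.sh scripts now pass them. Not the repository's real CLI.
import argparse
from yacs.config import CfgNode

parser = argparse.ArgumentParser()
parser.add_argument("--opts", nargs=2, action="append", default=[],
                    metavar=("KEY", "VALUE"),
                    help="override a config entry; may be given multiple times")
args = parser.parse_args(
    ["--opts", "decoding.decoding_method", "attention",
     "--opts", "decoding.batch_size", "64"])

config = CfgNode({"decoding": {"decoding_method": "ctc_greedy_search",
                               "batch_size": 128}})
# merge_from_list expects a flat [key, value, key, value, ...] list.
flat = [token for pair in args.opts for token in pair]
config.merge_from_list(flat)
print(config.decoding.decoding_method, config.decoding.batch_size)
```
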
diff --git a/examples/timit/s1/local/test.sh b/examples/timit/s1/local/test.sh index 868c8fda..05813179 100755 --- a/examples/timit/s1/local/test.sh +++ b/examples/timit/s1/local/test.sh @@ -36,7 +36,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -52,7 +53,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/s1/local/test.sh index 19872bb3..34088ce9 100755 --- a/examples/tiny/s1/local/test.sh +++ b/examples/tiny/s1/local/test.sh @@ -35,7 +35,8 @@ for type in attention ctc_greedy_search; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -51,7 +52,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do --config ${config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size} + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
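
Patches 07 and 08 above also move the valid/test/align loaders onto the same multi-worker path as training by passing `num_workers=config.collator.num_workers`, and the valid loader now uses the full `batch_size` instead of a quarter of it. A minimal, self-contained sketch of that pattern with `paddle.io` — using a toy dataset and collate function as stand-ins for the project's `ManifestDataset`/`SpeechCollator` — would be:

```python
# Toy illustration of a validation DataLoader with worker processes, echoing the
# num_workers change in deepspeech/exps/*/model.py. Dataset and collate_fn are stand-ins.
import numpy as np
from paddle.io import DataLoader, Dataset

class ToyManifest(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        # mimics a manifest entry: variable-length feature plus token ids
        return {"feat": np.random.rand(10 + idx, 4).astype("float32"),
                "text": np.array([1, 2, 3], dtype="int64")}

def collate(batch):
    # pad features to the longest utterance in the batch
    lens = np.array([item["feat"].shape[0] for item in batch], dtype="int64")
    feats = np.zeros((len(batch), int(lens.max()), 4), dtype="float32")
    for i, item in enumerate(batch):
        feats[i, :item["feat"].shape[0]] = item["feat"]
    texts = np.stack([item["text"] for item in batch])
    return feats, lens, texts

if __name__ == "__main__":
    valid_loader = DataLoader(
        ToyManifest(),
        batch_size=4,       # full batch size, no longer batch_size / 4
        shuffle=False,
        drop_last=False,
        collate_fn=collate,
        num_workers=2)      # worker processes, as in the patched loaders
    for feats, lens, texts in valid_loader:
        print(feats.shape, lens)
```
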
From 4745e15ece1aa8a5917ee85fb655cdb4899c8c0b Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 11:55:46 +0000 Subject: [PATCH 10/11] tiny run w cpu --- examples/tiny/s1/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh index d288e31a..6580afed 100755 --- a/examples/tiny/s1/run.sh +++ b/examples/tiny/s1/run.sh @@ -30,12 +30,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then From e7b0f5ed4aa572ba7245e6f6fc66972702bfd502 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 28 Sep 2021 12:17:04 +0000 Subject: [PATCH 11/11] reader default type is mat, sound need explicitlyc specify --- utils/format_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/format_data.py b/utils/format_data.py index e7dcfd23..682dbfdb 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -26,7 +26,7 @@ from deepspeech.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi") +add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -76,6 +76,7 @@ def main(): assert isinstance(feat_shape, (list, tuple)), type(feat_shape) if args.feat_type == 'raw': feat_shape.append(feat_dim) + line_json['filetype'] = 'sound' else: # kaldi raise NotImplementedError('no support kaldi feat now!') fout.write(json.dumps(line_json) + '\n')
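
The final commit makes `format_data.py` tag raw wav/flac entries with `"filetype": "sound"`, so the reader's `file_type()` helper (added earlier in this series, and fixed in PATCH 07 to split on the last `.`) can keep `mat` as the implicit default for Kaldi ark features while sound files are named explicitly. The `[-1]` fix matters for paths such as `feats.1.ark:2077`, where the old `[1]` index would have returned `1` instead of `ark`. Below is a standalone, hedged re-implementation of that suffix mapping, handy for sanity-checking manifest paths outside the repository:

```python
# Standalone mirror of the file_type() helper added in this patch series,
# for checking which loader a manifest "feat" path would select. Illustrative only.
def file_type(filepath: str) -> str:
    # strip a trailing ":offset" (Kaldi-style ark specifier), then take the last suffix
    suffix = filepath.split(":")[0].split(".")[-1]
    if suffix == "ark":
        return "mat"
    elif suffix == "scp":
        return "scp"
    elif suffix in ("npy", "npz"):
        return suffix
    elif suffix in ("wav", "flac"):  # PCM16 sound files
        return "sound"
    raise ValueError(f"Not supported filetype: {suffix}")

if __name__ == "__main__":
    for path in ["data/feats.1.ark:2077", "data/feats.scp",
                 "wavs/BAC009S0002W0122.wav", "wavs/speech.flac"]:
        print(path, "->", file_type(path))
```
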