Merge branch 'develop' of https://github.com/LittleChenCc/DeepSpeech into develop
commit 46df01151f
@@ -0,0 +1,191 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
import os
import sys
from pathlib import Path

import paddle

from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import mp_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.utility import UpdateConfig

logger = Log(__name__).getlog()


class DeepSpeech2Tester_hub():
    def __init__(self, config, args):
        self.args = args
        self.config = config
        self.audio_file = args.audio_file
        self.collate_fn_test = SpeechCollator.from_config(config)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.collator.unit_type, vocab_filepath=None)

    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            vocab_list,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch)
        # replace the '<space>' token with ' '
        result_transcripts = [
            self._text_featurizer.detokenize(sentence)
            for sentence in result_transcripts
        ]

        return result_transcripts

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        self.model.eval()
        cfg = self.config
        audio_file = self.audio_file
        collate_fn_test = self.collate_fn_test
        # a placeholder transcript (" ") is passed: only the audio features
        # are needed for decoding
        audio, _ = collate_fn_test.process_utterance(
            audio_file=audio_file, transcript=" ")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        vocab_list = collate_fn_test.vocab_list
        result_transcripts = self.compute_result_transcripts(
            audio, audio_len, vocab_list, cfg.decoding)
        logger.info("result_transcripts: " + result_transcripts[0])

    def run_test(self):
        self.resume()
        try:
            self.test()
        except KeyboardInterrupt:
            exit(-1)

    def setup(self):
        """Setup the experiment."""
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()
        self.setup_checkpointer()

        self.setup_model()

    def setup_output_dir(self):
        """Create a directory used for output."""
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir

    def setup_model(self):
        config = self.config.clone()
        with UpdateConfig(config):
            config.model.feat_size = self.collate_fn_test.feature_size
            config.model.dict_size = self.collate_fn_test.vocab_size

        if self.args.model_type == 'offline':
            model = DeepSpeech2Model.from_config(config.model)
        elif self.args.model_type == 'online':
            model = DeepSpeech2ModelOnline.from_config(config.model)
        else:
            raise Exception("wrong model type")

        self.model = model

    def setup_checkpointer(self):
        """Create a directory used to save checkpoints into.

        It is "checkpoints" inside the output directory.
        """
        checkpoint_dir = self.output_dir / "checkpoints"
        checkpoint_dir.mkdir(exist_ok=True)

        self.checkpoint_dir = checkpoint_dir

        self.checkpoint = Checkpoint(
            kbest_n=self.config.training.checkpoint.kbest_n,
            latest_n=self.config.training.checkpoint.latest_n)

    def resume(self):
        """Load model parameters from the checkpoint given by
        ``--checkpoint_path``; only the ``*.pdparams`` file is needed
        for testing.
        """
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)


def main_sp(config, args):
    exp = DeepSpeech2Tester_hub(config, args)
    exp.setup()
    exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    parser.add_argument("--model_type")
    parser.add_argument("--audio_file")
    # where to save the ASR result
    parser.add_argument(
        "--result_file", type=str, help="path to save the ASR result")
    args = parser.parse_args()
    print_arguments(args, globals())
    if args.model_type is None:
        args.model_type = 'offline'
    if not os.path.isfile(args.audio_file):
        print("Please provide a valid audio file path")
        sys.exit(-1)
    print("model_type:{}".format(args.model_type))

    # https://yaml.org/type/float.html
    config = get_cfg_defaults(args.model_type)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
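
# Usage sketch (not part of this commit): the programmatic equivalent of
# main_sp() above. `args` is assumed to be a namespace produced by
# default_argument_parser() with --checkpoint_path, --model_type and
# --audio_file set; config handling mirrors the __main__ block.
#
#     config = get_cfg_defaults(args.model_type)
#     config.freeze()
#     tester = DeepSpeech2Tester_hub(config, args)
#     tester.setup()       # device, output dir, checkpointer, model
#     tester.run_test()    # resume() then test()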
@@ -1,631 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import tarfile  # needed by SpeechCollator._parse_tar (missing in the original)
from collections import namedtuple
from typing import Optional

import kaldiio
import numpy as np
from yacs.config import CfgNode

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.utils.log import Log

__all__ = ["SpeechCollator", "KaldiPrePorocessedCollator"]

logger = Log(__name__).getlog()

# the namedtuple needs to be module-level (global) for pickle.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])


class SpeechCollator():
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                augmentation_config="",
                random_seed=0,
                mean_std_filepath="",
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
                feat_dim=0,  # 'mfcc', 'fbank'
                delta_delta=False,  # 'mfcc', 'fbank'
                stride_ms=10.0,  # ms
                window_ms=20.0,  # ms
                n_fft=None,  # fft points
                max_freq=None,  # None for samplerate/2
                target_sample_rate=16000,  # target sample rate
                use_dB_normalization=True,
                target_dB=-20,
                dither=1.0,  # feature dither
                keep_transcription_text=False))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a SpeechCollator object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            SpeechCollator: collator object.
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
        assert 'mean_std_filepath' in config.collator
        assert 'vocab_filepath' in config.collator
        assert 'specgram_type' in config.collator
        assert 'n_fft' in config.collator
        assert config.collator

        if isinstance(config.collator.augmentation_config, (str, bytes)):
            if config.collator.augmentation_config:
                aug_file = io.open(
                    config.collator.augmentation_config,
                    mode='r',
                    encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.collator.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        speech_collator = cls(
            aug_file=aug_file,
            random_seed=0,
            mean_std_filepath=config.collator.mean_std_filepath,
            unit_type=config.collator.unit_type,
            vocab_filepath=config.collator.vocab_filepath,
            spm_model_prefix=config.collator.spm_model_prefix,
            specgram_type=config.collator.specgram_type,
            feat_dim=config.collator.feat_dim,
            delta_delta=config.collator.delta_delta,
            stride_ms=config.collator.stride_ms,
            window_ms=config.collator.window_ms,
            n_fft=config.collator.n_fft,
            max_freq=config.collator.max_freq,
            target_sample_rate=config.collator.target_sample_rate,
            use_dB_normalization=config.collator.use_dB_normalization,
            target_dB=config.collator.target_dB,
            dither=config.collator.dither,
            keep_transcription_text=config.collator.keep_transcription_text)
        return speech_collator

    def __init__(
            self,
            aug_file,
            mean_std_filepath,
            vocab_filepath,
            spm_model_prefix,
            random_seed=0,
            unit_type="char",
            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
            feat_dim=0,  # 'mfcc', 'fbank'
            delta_delta=False,  # 'mfcc', 'fbank'
            stride_ms=10.0,  # ms
            window_ms=20.0,  # ms
            n_fft=None,  # fft points
            max_freq=None,  # None for samplerate/2
            target_sample_rate=16000,  # target sample rate
            use_dB_normalization=True,
            target_dB=-20,
            dither=1.0,
            keep_transcription_text=True):
        """SpeechCollator collator.

        Args:
            unit_type (str): token unit type, e.g. char, word, spm.
            vocab_filepath (str): vocab file path.
            mean_std_filepath (str): mean and std file path, with a *.npy suffix.
            spm_model_prefix (str): spm model prefix; needed if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json string. Defaults to '{}'.
            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
            window_ms (float, optional): window size in ms. Defaults to 20.0.
            n_fft (int, optional): fft points for rfft. Defaults to None.
            max_freq (int, optional): maximum cutoff frequency. Defaults to None.
            target_sample_rate (int, optional): target sample rate used for training. Defaults to 16000.
            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
            feat_dim (int, optional): audio feature dim, used by 'mfcc' and 'fbank'. Defaults to None.
            delta_delta (bool, optional): audio feature with delta-delta, used by 'fbank' and 'mfcc'. Defaults to False.
            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
            target_dB (int, optional): target dB. Defaults to -20.
            random_seed (int, optional): seed for the random generator. Defaults to 0.
            keep_transcription_text (bool, optional): skip tokenization when True
                (typically when not in training mode); text is then the raw string
                instead of token ids. Defaults to True.

        Does augmentations, and pads audio features with zeros to make them
        have the same shape (or a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text

        self._local_data = TarLocalData(tar2info={}, tar2object={})
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), random_seed=random_seed)

        self._normalizer = FeatureNormalizer(
            mean_std_filepath) if mean_std_filepath else None

        self._stride_ms = stride_ms
        self._target_sample_rate = target_sample_rate

        self._speech_featurizer = SpeechFeaturizer(
            unit_type=unit_type,
            vocab_filepath=vocab_filepath,
            spm_model_prefix=spm_model_prefix,
            specgram_type=specgram_type,
            feat_dim=feat_dim,
            delta_delta=delta_delta,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB,
            dither=dither)

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map of its tarinfo objects.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get a subfile object from a tar archive.

        `file` has the form "tar:<tarpath>#<filename>". The tarfile object
        and its member info are cached for subsequent read requests.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            tar_object, infos = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infos
            self._local_data.tar2object[tarpath] = tar_object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    @property
    def manifest(self):
        return self._manifest

    @property
    def vocab_size(self):
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        return self._speech_featurizer.vocab_list

    @property
    def vocab_dict(self):
        return self._speech_featurizer.vocab_dict

    @property
    def text_feature(self):
        return self._speech_featurizer.text_feature

    @property
    def feature_size(self):
        return self._speech_featurizer.feature_size

    @property
    def stride_ms(self):
        return self._speech_featurizer.stride_ms

    def process_utterance(self, audio_file, translation):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: str | file
        :param translation: translation text.
        :type translation: str
        :return: Tuple of audio feature tensor and data of translation part,
                 where translation part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), translation)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, translation)

        # audio augment
        self._augmentation_pipeline.transform_audio(speech_segment)

        specgram, translation_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)
        return specgram, translation_part

    def __call__(self, batch):
        """Batch examples.

        Args:
            batch (List): each item is (utt, audio, text)
                audio (np.ndarray): shape (T, D)
                text (List[int] or str): shape (U,)

        Returns:
            tuple(utts, audio, audio_lens, text, text_lens): batched data.
                audio : (B, Tmax, D)
                audio_lens: (B,)
                text : (B, Umax)
                text_lens: (B,)
        """
        audios = []
        audio_lens = []
        texts = []
        text_lens = []
        utts = []
        for utt, audio, text in batch:
            audio, text = self.process_utterance(audio, text)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids;
            # otherwise text is a string, converted to unicode ordinals
            tokens = []
            if self._keep_transcription_text:
                assert isinstance(text, str), (type(text), text)
                tokens = [ord(t) for t in text]
            else:
                tokens = text  # token ids
            tokens = tokens if isinstance(tokens, np.ndarray) else np.array(
                tokens, dtype=np.int64)
            texts.append(tokens)
            text_lens.append(tokens.shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_texts = pad_sequence(
            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, padded_texts, text_lens
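
# Usage sketch (not part of the original file): the collator is designed to
# serve as a DataLoader collate_fn. `dataset` is a hypothetical
# paddle.io.Dataset yielding (utt, audio_file, text) examples.
#
#     collator = SpeechCollator.from_config(config)
#     loader = paddle.io.DataLoader(dataset, batch_size=16, collate_fn=collator)
#     for utts, audio, audio_lens, text, text_lens in loader:
#         ...  # audio: (B, Tmax, D), text: (B, Umax)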


class TripletSpeechCollator(SpeechCollator):
    def process_utterance(self, audio_file, translation, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: str | file
        :param translation: translation text.
        :type translation: str
        :param transcript: transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of translation and
                 transcription parts, which could be token ids or text.
        :rtype: tuple of (2darray, list, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), translation)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, translation)

        # audio augment
        self._augmentation_pipeline.transform_audio(speech_segment)

        specgram, translation_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        transcript_part = self._speech_featurizer._text_featurizer.featurize(
            transcript)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)
        return specgram, translation_part, transcript_part

    def __call__(self, batch):
        """Batch examples.

        Args:
            batch (List): each item is (utt, audio, translation, transcription)
                audio (np.ndarray): shape (T, D)
                translation (List[int] or str): shape (U,)
                transcription (List[int] or str): shape (V,)

        Returns:
            tuple: batched data as
                audio : (B, Tmax, D)
                audio_lens: (B,)
                translation : (B, Umax), translation_lens: (B,)
                transcription : (B, Vmax), transcription_lens: (B,)
        """
        audios = []
        audio_lens = []
        translation_text = []
        translation_text_lens = []
        transcription_text = []
        transcription_text_lens = []

        utts = []
        for utt, audio, translation, transcription in batch:
            audio, translation, transcription = self.process_utterance(
                audio, translation, transcription)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids;
            # otherwise text is a string, converted to unicode ordinals
            tokens = [[], []]
            for idx, text in enumerate([translation, transcription]):
                if self._keep_transcription_text:
                    assert isinstance(text, str), (type(text), text)
                    tokens[idx] = [ord(t) for t in text]
                else:
                    tokens[idx] = text  # token ids
                tokens[idx] = tokens[idx] if isinstance(
                    tokens[idx], np.ndarray) else np.array(
                        tokens[idx], dtype=np.int64)
            translation_text.append(tokens[0])
            translation_text_lens.append(tokens[0].shape[0])
            transcription_text.append(tokens[1])
            transcription_text_lens.append(tokens[1].shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_translation = pad_sequence(
            translation_text, padding_value=IGNORE_ID).astype(np.int64)
        translation_lens = np.array(translation_text_lens).astype(np.int64)
        padded_transcription = pad_sequence(
            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
        transcription_lens = np.array(transcription_text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, (
            padded_translation, padded_transcription), (translation_lens,
                                                        transcription_lens)


class KaldiPrePorocessedCollator(SpeechCollator):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                augmentation_config="",
                random_seed=0,
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                feat_dim=0,
                stride_ms=10.0,
                keep_transcription_text=False))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a KaldiPrePorocessedCollator object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            KaldiPrePorocessedCollator: collator object.
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
        assert 'vocab_filepath' in config.collator
        assert config.collator

        if isinstance(config.collator.augmentation_config, (str, bytes)):
            if config.collator.augmentation_config:
                aug_file = io.open(
                    config.collator.augmentation_config,
                    mode='r',
                    encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.collator.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        speech_collator = cls(
            aug_file=aug_file,
            random_seed=0,
            unit_type=config.collator.unit_type,
            vocab_filepath=config.collator.vocab_filepath,
            spm_model_prefix=config.collator.spm_model_prefix,
            feat_dim=config.collator.feat_dim,
            stride_ms=config.collator.stride_ms,
            keep_transcription_text=config.collator.keep_transcription_text)
        return speech_collator

    def __init__(self,
                 aug_file,
                 vocab_filepath,
                 spm_model_prefix,
                 random_seed=0,
                 unit_type="char",
                 feat_dim=0,
                 stride_ms=10.0,
                 keep_transcription_text=True):
        """KaldiPrePorocessedCollator collator.

        Args:
            unit_type (str): token unit type, e.g. char, word, spm.
            vocab_filepath (str): vocab file path.
            spm_model_prefix (str): spm model prefix; needed if `unit_type` is spm.
            augmentation_config (str, optional): augmentation json string. Defaults to '{}'.
            random_seed (int, optional): seed for the random generator. Defaults to 0.
            keep_transcription_text (bool, optional): skip tokenization when True
                (typically when not in training mode); text is then the raw string
                instead of token ids. Defaults to True.

        Does augmentations, and pads audio features with zeros to make them
        have the same shape (or a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text
        self._feat_dim = feat_dim
        self._stride_ms = stride_ms

        self._local_data = TarLocalData(tar2info={}, tar2object={})
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), random_seed=random_seed)

        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
                                               spm_model_prefix)

    def process_utterance(self, audio_file, translation):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of a kaldi-processed feature.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :return: Tuple of audio feature tensor and data of translation part,
                 where translation part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        specgram = kaldiio.load_mat(audio_file)
        assert specgram.shape[
            1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
                self._feat_dim, specgram.shape[1])

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)

        if self._keep_transcription_text:
            return specgram, translation
        else:
            text_ids = self._text_featurizer.featurize(translation)
            return specgram, text_ids
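
    # Usage sketch (hypothetical specifier): kaldiio.load_mat also accepts
    # "path.ark:offset" style specifiers as found in Kaldi scp files, e.g.
    #
    #     specgram, text_ids = collator.process_utterance("dump/feats.ark:42",
    #                                                     "some text")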


class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
    def process_utterance(self, audio_file, translation, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of a kaldi-processed feature.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of translation and
                 transcription parts, where translation and transcription
                 parts could be token ids or text.
        :rtype: tuple of (2darray, (list, list))
        """
        specgram = kaldiio.load_mat(audio_file)
        assert specgram.shape[
            1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
                self._feat_dim, specgram.shape[1])

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)

        if self._keep_transcription_text:
            return specgram, translation, transcript
        else:
            translation_text_ids = self._text_featurizer.featurize(translation)
            transcript_text_ids = self._text_featurizer.featurize(transcript)
            return specgram, translation_text_ids, transcript_text_ids

    def __call__(self, batch):
        """Batch examples.

        Args:
            batch (List): each item is (utt, audio, translation, transcription)
                audio (np.ndarray): shape (T, D)
                translation (List[int] or str): shape (U,)
                transcription (List[int] or str): shape (V,)

        Returns:
            tuple: batched data as
                audio : (B, Tmax, D)
                audio_lens: (B,)
                translation_text : (B, Umax), translation_text_lens: (B,)
                transcription_text : (B, Vmax), transcription_text_lens: (B,)
        """
        audios = []
        audio_lens = []
        translation_text = []
        translation_text_lens = []
        transcription_text = []
        transcription_text_lens = []

        utts = []
        for utt, audio, translation, transcription in batch:
            audio, translation, transcription = self.process_utterance(
                audio, translation, transcription)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids;
            # otherwise text is a string, converted to unicode ordinals
            tokens = [[], []]
            for idx, text in enumerate([translation, transcription]):
                if self._keep_transcription_text:
                    assert isinstance(text, str), (type(text), text)
                    tokens[idx] = [ord(t) for t in text]
                else:
                    tokens[idx] = text  # token ids
                tokens[idx] = tokens[idx] if isinstance(
                    tokens[idx], np.ndarray) else np.array(
                        tokens[idx], dtype=np.int64)
            translation_text.append(tokens[0])
            translation_text_lens.append(tokens[0].shape[0])
            transcription_text.append(tokens[1])
            transcription_text_lens.append(tokens[1].shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_translation = pad_sequence(
            translation_text, padding_value=IGNORE_ID).astype(np.int64)
        translation_lens = np.array(translation_text_lens).astype(np.int64)
        padded_transcription = pad_sequence(
            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
        transcription_lens = np.array(transcription_text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, (
            padded_translation, padded_transcription), (translation_lens,
                                                        transcription_lens)
@@ -0,0 +1,36 @@
#!/bin/bash

if [ $# != 4 ]; then
    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2
model_type=$3
audio_file=$4

# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
    exit 1
fi

python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} \
--audio_file ${audio_file}

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi

exit 0
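
# Example invocation (hypothetical paths; BIN_DIR is assumed to be exported
# by the recipe's path.sh before this script runs):
#   CUDA_VISIBLE_DEVICES=0 bash local/test_hub.sh conf/deepspeech2.yaml \
#       exp/deepspeech2/checkpoints/avg_1 offline data/demo.wav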
@@ -0,0 +1,26 @@
coverage
gpustat
jsonlines
kaldiio
llvmlite==0.31.0
loguru
numba==0.47.0
numpy==1.18.5
Pillow
pre-commit
pybind11
python-speech-features
resampy==0.2.2
sacrebleu
scipy==1.2.1
sentencepiece
snakeviz
SoundFile==0.9.0.post1
sox
soxbindings
tensorboardX
textgrid
tqdm
typeguard
visualdl==2.2.0
yacs
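# (these pins are installed by the setup script below via: pip3 install -r requirements.txt)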
@@ -0,0 +1,66 @@
#! /usr/bin/env bash

cd .. >> /dev/null
source utils/log.sh

SUDO='sudo'
if [ $(id -u) -eq 0 ]; then
    SUDO=''
fi

if [ -e /etc/lsb-release ]; then
    ${SUDO} apt-get update -y
    ${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
    if [ $? != 0 ]; then
        error_msg "Please use Ubuntu, or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev manually."
        exit -1
    fi
fi

source tools/venv/bin/activate

cd -
# install python dependencies
if [ -f "requirements.txt" ]; then
    pip3 install -r requirements.txt
    if [ $? != 0 ]; then
        error_msg "Failed to install python dependencies!"
        exit 1
    fi
fi
cd .. >> /dev/null

# install package libsndfile
python3 -c "import soundfile"
if [ $? != 0 ]; then
    info_msg "Install package libsndfile into default system path."
    wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
    if [ $? != 0 ]; then
        error_msg "Failed to download libsndfile-1.0.28.tar.gz!"
        exit 1
    fi
    tar -zxvf libsndfile-1.0.28.tar.gz
    cd libsndfile-1.0.28
    ./configure > /dev/null && make > /dev/null && make install > /dev/null
    cd ..
    rm -rf libsndfile-1.0.28
    rm libsndfile-1.0.28.tar.gz
fi

# install decoders
python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
if [ $? != 0 ]; then
    cd deepspeech/decoders/swig > /dev/null
    sh setup.sh
    cd - > /dev/null
fi
python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
if [ $? != 0 ]; then
    error_msg "Please check why the decoders failed to install!"
    exit -1
fi

info_msg "Install all dependencies successfully."