Merge branch 'develop' of https://github.com/LittleChenCc/DeepSpeech into develop

Branch: pull/867/head
Author: Junkun, 3 years ago
Commit: 46df01151f

@@ -0,0 +1,191 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
import os
import sys
from pathlib import Path

import paddle

from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import mp_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.utility import UpdateConfig

logger = Log(__name__).getlog()


class DeepSpeech2Tester_hub():
    """Tester which transcribes a single audio file with a trained model."""

    def __init__(self, config, args):
        self.args = args
        self.config = config
        self.audio_file = args.audio_file
        self.collate_fn_test = SpeechCollator.from_config(config)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.collator.unit_type, vocab_filepath=None)

    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            vocab_list,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch)

        # replace the '<space>' token with ' '
        result_transcripts = [
            self._text_featurizer.detokenize(sentence)
            for sentence in result_transcripts
        ]
        return result_transcripts

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        self.model.eval()
        cfg = self.config
        audio_file = self.audio_file
        collate_fn_test = self.collate_fn_test
        # a dummy transcript: only the audio is needed for inference
        audio, _ = collate_fn_test.process_utterance(
            audio_file=audio_file, transcript=" ")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)  # add batch dim: [1, T, D]
        vocab_list = collate_fn_test.vocab_list
        result_transcripts = self.compute_result_transcripts(
            audio, audio_len, vocab_list, cfg.decoding)
        logger.info("result_transcripts: " + result_transcripts[0])

    def run_test(self):
        self.resume()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)

    def setup(self):
        """Setup the experiment."""
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        self.setup_output_dir()
        self.setup_checkpointer()
        self.setup_model()

    def setup_output_dir(self):
        """Create a directory used for output."""
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
        else:
            # fall back to the experiment root above the checkpoint dir
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
        output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir

    def setup_model(self):
        config = self.config.clone()
        with UpdateConfig(config):
            config.model.feat_size = self.collate_fn_test.feature_size
            config.model.dict_size = self.collate_fn_test.vocab_size

        if self.args.model_type == 'offline':
            model = DeepSpeech2Model.from_config(config.model)
        elif self.args.model_type == 'online':
            model = DeepSpeech2ModelOnline.from_config(config.model)
        else:
            raise Exception("wrong model type")

        self.model = model

    def setup_checkpointer(self):
        """Create a directory used to save checkpoints into.

        It is "checkpoints" inside the output directory.
        """
        checkpoint_dir = self.output_dir / "checkpoints"
        checkpoint_dir.mkdir(exist_ok=True)
        self.checkpoint_dir = checkpoint_dir
        self.checkpoint = Checkpoint(
            kbest_n=self.config.training.checkpoint.kbest_n,
            latest_n=self.config.training.checkpoint.latest_n)

    def resume(self):
        """Load a specified checkpoint (no training state, parameters only)."""
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)


def main_sp(config, args):
    exp = DeepSpeech2Tester_hub(config, args)
    exp.setup()
    exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    parser.add_argument("--model_type")
    parser.add_argument("--audio_file")
    # save the asr result to this file
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())

    if args.model_type is None:
        args.model_type = 'offline'
    if not os.path.isfile(args.audio_file):
        print("Please input the audio file path")
        sys.exit(-1)
    print("model_type:{}".format(args.model_type))

    # https://yaml.org/type/float.html
    config = get_cfg_defaults(args.model_type)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
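A minimal, self-contained sketch of the output-directory fallback in setup_output_dir above: when --output is not given, results land two levels above the checkpoint file, i.e. in the experiment root (the path below is hypothetical):

from pathlib import Path

# hypothetical checkpoint layout: <exp_root>/checkpoints/avg_1
checkpoint_path = Path("exp/deepspeech2/checkpoints/avg_1").expanduser()

# same fallback as setup_output_dir: the parent of "checkpoints" is the exp root
output_dir = checkpoint_path.parent.parent
output_dir.mkdir(parents=True, exist_ok=True)
print(output_dir)  # exp/deepspeech2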

@@ -235,16 +235,18 @@ class DeepSpeech2Trainer(Trainer):
             num_workers=config.collator.num_workers)
         self.valid_loader = DataLoader(
             dev_dataset,
-            batch_size=int(config.collator.batch_size / 4),
+            batch_size=int(config.collator.batch_size),
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn_dev)
+            collate_fn=collate_fn_dev,
+            num_workers=config.collator.num_workers)
         self.test_loader = DataLoader(
             test_dataset,
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn_test)
+            collate_fn=collate_fn_test,
+            num_workers=config.collator.num_workers)
         logger.info("Setup train/valid/test Dataloader!")

@@ -216,6 +216,7 @@ class U2Trainer(Trainer):
                     msg += f"{v:>.8f}" if isinstance(v,
                                                      float) else f"{v}"
                     msg += ","
+                msg = msg[:-1]  # remove the last ","
                 if (batch_index + 1
                     ) % self.config.training.log_interval == 0:
                     logger.info(msg)
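The added msg = msg[:-1] trims the trailing comma the loop always appends. A standalone check of that message-building pattern (sample metric values, not from the project):

vals = {'train_loss': 1.23456789, 'lr': 0.001, 'batch': 10}
msg = ""
for k, v in vals.items():
    msg += f"{k}: "
    msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
    msg += ","
msg = msg[:-1]  # remove the last ","
print(msg)  # train_loss: 1.23456789,lr: 0.00100000,batch: 10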
@@ -243,6 +244,7 @@ class U2Trainer(Trainer):
                 self.visualizer.add_scalars(
                     'epoch', {'cv_loss': cv_loss,
                               'lr': self.lr_scheduler()}, self.epoch)
+
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
@@ -291,7 +293,8 @@ class U2Trainer(Trainer):
             batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn_dev)
+            collate_fn=collate_fn_dev,
+            num_workers=config.collator.num_workers, )

         # test dataset, return raw text
         config.data.manifest = config.data.test_manifest
@@ -313,7 +316,8 @@ class U2Trainer(Trainer):
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=SpeechCollator.from_config(config))
+            collate_fn=SpeechCollator.from_config(config),
+            num_workers=config.collator.num_workers, )

         # return text token id
         config.collator.keep_transcription_text = False
         self.align_loader = DataLoader(
@@ -321,7 +325,8 @@ class U2Trainer(Trainer):
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=SpeechCollator.from_config(config))
+            collate_fn=SpeechCollator.from_config(config),
+            num_workers=config.collator.num_workers, )
         logger.info("Setup train/valid/test/align Dataloader!")

     def setup_model(self):

@@ -28,12 +28,9 @@ from paddle import distributed as dist
 from paddle.io import DataLoader
 from yacs.config import CfgNode

-from deepspeech.io.collator_st import KaldiPrePorocessedCollator
-from deepspeech.io.collator_st import SpeechCollator
-from deepspeech.io.collator_st import TripletKaldiPrePorocessedCollator
-from deepspeech.io.collator_st import TripletSpeechCollator
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.io.collator import TripletSpeechCollator
 from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.dataset import TripletManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
@@ -251,29 +248,19 @@ class U2STTrainer(Trainer):
         config.collator.keep_transcription_text = False

         # train/valid dataset, return token ids
-        Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
         config.data.manifest = config.data.train_manifest
-        train_dataset = Dataset.from_config(config)
+        train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        dev_dataset = Dataset.from_config(config)
+        dev_dataset = ManifestDataset.from_config(config)

-        if config.collator.raw_wav:
-            if config.model.model_conf.asr_weight > 0.:
-                Collator = TripletSpeechCollator
-                TestCollator = SpeechCollator
-            else:
-                TestCollator = Collator = SpeechCollator
-            # Not yet implement the mtl loader for raw_wav.
+        if config.model.model_conf.asr_weight > 0.:
+            Collator = TripletSpeechCollator
+            TestCollator = SpeechCollator
         else:
-            if config.model.model_conf.asr_weight > 0.:
-                Collator = TripletKaldiPrePorocessedCollator
-                TestCollator = KaldiPrePorocessedCollator
-            else:
-                TestCollator = Collator = KaldiPrePorocessedCollator
+            TestCollator = Collator = SpeechCollator

         collate_fn_train = Collator.from_config(config)
         config.collator.augmentation_config = ""
         collate_fn_dev = Collator.from_config(config)
@@ -305,7 +292,8 @@ class U2STTrainer(Trainer):
             batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn_dev)
+            collate_fn=collate_fn_dev,
+            num_workers=config.collator.num_workers, )

         # test dataset, return raw text
         config.data.manifest = config.data.test_manifest
@@ -326,7 +314,8 @@ class U2STTrainer(Trainer):
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=TestCollator.from_config(config))
+            collate_fn=TestCollator.from_config(config),
+            num_workers=config.collator.num_workers, )

         # return text token id
         config.collator.keep_transcription_text = False
         self.align_loader = DataLoader(
@@ -334,7 +323,8 @@ class U2STTrainer(Trainer):
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=TestCollator.from_config(config))
+            collate_fn=TestCollator.from_config(config),
+            num_workers=config.collator.num_workers, )
         logger.info("Setup train/valid/test/align Dataloader!")

     def setup_model(self):
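With collator_st retired, collator choice reduces to the ASR loss weight: any positive asr_weight means training batches carry (translation, transcription) pairs and need the triplet collator, while test/align always use the plain one. A hypothetical mirror of that selection (the classes below are stand-ins for the deepspeech.io.collator ones):

class SpeechCollator:  # stand-in
    ...

class TripletSpeechCollator(SpeechCollator):  # stand-in
    ...

asr_weight = 0.5  # e.g. config.model.model_conf.asr_weight
if asr_weight > 0.:
    Collator = TripletSpeechCollator
    TestCollator = SpeechCollator
else:
    TestCollator = Collator = SpeechCollator
print(Collator.__name__, TestCollator.__name__)
# TripletSpeechCollator SpeechCollator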

@@ -24,8 +24,10 @@ import soundfile
 import soxbindings as sox
 from scipy import signal

+from .utility import subfile_from_tar

-class AudioSegment(object):
+
+class AudioSegment():
     """Monaural audio segment abstraction.

     :param samples: Audio samples [num_samples x num_channels].
@@ -68,16 +70,20 @@ class AudioSegment(object):
                           self.duration, self.rms_db))

     @classmethod
-    def from_file(cls, file):
+    def from_file(cls, file, infos=None):
         """Create audio segment from audio file.

-        :param filepath: Filepath or file object to audio file.
-        :type filepath: str|file
-        :return: Audio segment instance.
-        :rtype: AudioSegment
+        Args:
+            filepath (str|file): Filepath or file object to audio file.
+            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
+
+        Returns:
+            AudioSegment: Audio segment instance.
         """
         if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
             return cls.from_sequence_file(file)
+        elif isinstance(file, str) and file.startswith('tar:'):
+            return cls.from_file(subfile_from_tar(file, infos))
         else:
             samples, sample_rate = soundfile.read(file, dtype='float32')
             return cls(samples, sample_rate)
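The new 'tar:' branch defers to subfile_from_tar, which expects locators of the form tar:<tarpath>#<member>. A quick sketch of how that string splits (the path is hypothetical):

# hypothetical locator for a wav packed inside a tar archive
file = "tar:/data/utts.tar#utt_0001.wav"

tarpath, filename = file.split(':', 1)[1].split('#', 1)
print(tarpath)   # /data/utts.tar
print(filename)  # utt_0001.wav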

@@ -24,15 +24,15 @@ class AudioFeaturizer():
     Currently, it supports feature types of linear spectrogram and mfcc.

-    :param specgram_type: Specgram feature type. Options: 'linear'.
-    :type specgram_type: str
+    :param spectrum_type: Specgram feature type. Options: 'linear'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
         corresponding to frequencies between [0, max_freq] are
-        returned; when specgram_type is 'mfcc', max_feq is the
+        returned; when spectrum_type is 'mfcc', max_feq is the
         highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Audio are resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer():
     """

     def __init__(self,
-                 specgram_type: str='linear',
+                 spectrum_type: str='linear',
                  feat_dim: int=None,
                  delta_delta: bool=False,
                  stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer():
                  use_dB_normalization=True,
                  target_dB=-20,
                  dither=1.0):
-        self._specgram_type = specgram_type
+        self._spectrum_type = spectrum_type
         # mfcc and fbank using `feat_dim`
         self._feat_dim = feat_dim
         # mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer():
     def feature_size(self):
         """audio feature size"""
         feat_dim = 0
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             fft_point = self._window_ms if self._fft_point is None else self._fft_point
             feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
                            1)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             # mfcc, delta, delta-delta
             feat_dim = int(self._feat_dim *
                            3) if self._delta_delta else int(self._feat_dim)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             # fbank, delta, delta-delta
             feat_dim = int(self._feat_dim *
                            3) if self._delta_delta else int(self._feat_dim)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear." % self._spectrum_type)
         return feat_dim

     def _compute_specgram(self, audio_segment):
         """Extract various audio features."""
         sample_rate = audio_segment.sample_rate
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             samples = audio_segment.samples
             return self._compute_linear_specgram(
                 samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer():
                 stride_ms=self._stride_ms,
                 window_ms=self._window_ms,
                 max_freq=self._max_freq)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             samples = audio_segment.to('int16')
             return self._compute_mfcc(
                 samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer():
                 max_freq=self._max_freq,
                 dither=self._dither,
                 delta_delta=self._delta_delta)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             samples = audio_segment.to('int16')
             return self._compute_fbank(
                 samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer():
                 dither=self._dither,
                 delta_delta=self._delta_delta)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear." % self._spectrum_type)

     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""

@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

 class SpeechFeaturizer():
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type specgram_type: str
-    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type specgram_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
-        corresponding to frequencies between [0, max_freq] are
-        returned; when specgram_type is 'mfcc', max_freq is the
-        highest band edge of mel filters.
-    :types max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
-    """
+    """Speech and Text feature extraction."""

     def __init__(self,
                  unit_type,
                  vocab_filepath,
                  spm_model_prefix=None,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  stride_ms=10.0,
@@ -64,9 +34,13 @@ class SpeechFeaturizer():
                  target_sample_rate=16000,
                  use_dB_normalization=True,
                  target_dB=-20,
-                 dither=1.0):
-        self._audio_featurizer = AudioFeaturizer(
-            specgram_type=specgram_type,
+                 dither=1.0,
+                 maskctc=False):
+        self.stride_ms = stride_ms
+        self.window_ms = window_ms
+
+        self.audio_feature = AudioFeaturizer(
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
@@ -77,8 +51,12 @@ class SpeechFeaturizer():
             use_dB_normalization=use_dB_normalization,
             target_dB=target_dB,
             dither=dither)
-        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
-                                               spm_model_prefix)
+
+        self.text_feature = TextFeaturizer(
+            unit_type=unit_type,
+            vocab_filepath=vocab_filepath,
+            spm_model_prefix=spm_model_prefix,
+            maskctc=maskctc)

     def featurize(self, speech_segment, keep_transcription_text):
         """Extract features for speech segment.
@@ -94,60 +72,33 @@ class SpeechFeaturizer():
         Returns:
             tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices.
         """
-        spec_feature = self._audio_featurizer.featurize(speech_segment)
+        spec_feature = self.audio_feature.featurize(speech_segment)
         if keep_transcription_text:
             return spec_feature, speech_segment.transcript
         if speech_segment.has_token:
             text_ids = speech_segment.token_ids
         else:
-            text_ids = self._text_featurizer.featurize(
-                speech_segment.transcript)
+            text_ids = self.text_feature.featurize(speech_segment.transcript)
         return spec_feature, text_ids

-    @property
-    def vocab_size(self):
-        """Return the vocabulary size.
-        Returns:
-            int: Vocabulary size.
-        """
-        return self._text_featurizer.vocab_size
-
-    @property
-    def vocab_list(self):
-        """Return the vocabulary in list.
-        Returns:
-            List[str]:
-        """
-        return self._text_featurizer.vocab_list
-
-    @property
-    def vocab_dict(self):
-        """Return the vocabulary in dict.
-        Returns:
-            Dict[str, int]:
-        """
-        return self._text_featurizer.vocab_dict
-
-    @property
-    def feature_size(self):
-        """Return the audio feature size.
-        Returns:
-            int: audio feature size.
-        """
-        return self._audio_featurizer.feature_size
-
-    @property
-    def stride_ms(self):
-        """time length in `ms` unit per frame
-        Returns:
-            float: time(ms)/frame
-        """
-        return self._audio_featurizer.stride_ms
-
-    @property
-    def text_feature(self):
-        """Return the text feature object.
-        Returns:
-            TextFeaturizer: object.
-        """
-        return self._text_featurizer
+    def text_featurize(self, text, keep_transcription_text):
+        """Extract features for speech segment.
+
+        1. For audio parts, extract the audio features.
+        2. For transcript parts, keep the original text or convert text string
+           to a list of token indices in char-level.
+
+        Args:
+            text (str): text.
+            keep_transcription_text (bool): True, keep transcript text, False, token ids
+
+        Returns:
+            (str|List[int]): text, or list of token indices.
+        """
+        if keep_transcription_text:
+            return text
+
+        text_ids = self.text_feature.featurize(text)
+        return text_ids

@@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment):
         return not self.__eq__(other)

     @classmethod
-    def from_file(cls, filepath, transcript, tokens=None, token_ids=None):
+    def from_file(cls,
+                  filepath,
+                  transcript,
+                  tokens=None,
+                  token_ids=None,
+                  infos=None):
         """Create speech segment from audio file and corresponding transcript.

         Args:
@@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment):
             transcript (str): Transcript text for the speech.
             tokens (List[str], optional): text tokens. Defaults to None.
             token_ids (List[int], optional): text token ids. Defaults to None.
+            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.

         Returns:
             SpeechSegment: Speech segment instance.
         """
-        audio = AudioSegment.from_file(filepath)
+        audio = AudioSegment.from_file(filepath, infos)
         return cls(audio.samples, audio.sample_rate, transcript, tokens,
                    token_ids)

@@ -14,6 +14,8 @@
 """Contains data helper functions."""
 import json
 import math
+import tarfile
+from collections import namedtuple
 from typing import List
 from typing import Optional
 from typing import Text
@@ -112,6 +114,51 @@ def read_manifest(
     return manifest


+# Tar File read
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+
+
+def parse_tar(file):
+    """Parse a tar file to get a tarfile object
+    and a map containing tarinfoes
+    """
+    result = {}
+    f = tarfile.open(file)
+    for tarinfo in f.getmembers():
+        result[tarinfo.name] = tarinfo
+    return f, result
+
+
+def subfile_from_tar(file, local_data=None):
+    """Get subfile object from tar.
+
+    tar:tarpath#filename
+
+    It will return a subfile object from tar file
+    and cached tar file info for next reading request.
+    """
+    tarpath, filename = file.split(':', 1)[1].split('#', 1)
+
+    if local_data is None:
+        local_data = TarLocalData(tar2info={}, tar2object={})
+
+    assert isinstance(local_data, TarLocalData)
+
+    if 'tar2info' not in local_data.__dict__:
+        local_data.tar2info = {}
+    if 'tar2object' not in local_data.__dict__:
+        local_data.tar2object = {}
+
+    if tarpath not in local_data.tar2info:
+        fobj, infos = parse_tar(tarpath)
+        local_data.tar2info[tarpath] = infos
+        local_data.tar2object[tarpath] = fobj
+    else:
+        fobj = local_data.tar2object[tarpath]
+        infos = local_data.tar2info[tarpath]
+
+    return fobj.extractfile(infos[filename])
+
+
 def rms_to_db(rms: float):
     """Root Mean Square to dB.

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
-from collections import namedtuple
 from typing import Optional

 import numpy as np
@@ -23,96 +22,30 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
 from deepspeech.frontend.normalizer import FeatureNormalizer
 from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.frontend.utility import TarLocalData
+from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.io.utility import pad_list
 from deepspeech.utils.log import Log

-__all__ = ["SpeechCollator"]
+__all__ = ["SpeechCollator", "TripletSpeechCollator"]

 logger = Log(__name__).getlog()

-# namedtupe need global for pickle.
-TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
-
-
-class SpeechCollator():
-    @classmethod
-    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
-        default = CfgNode(
-            dict(
-                augmentation_config="",
-                random_seed=0,
-                mean_std_filepath="",
-                unit_type="char",
-                vocab_filepath="",
-                spm_model_prefix="",
-                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-                feat_dim=0,  # 'mfcc', 'fbank'
-                delta_delta=False,  # 'mfcc', 'fbank'
-                stride_ms=10.0,  # ms
-                window_ms=20.0,  # ms
-                n_fft=None,  # fft points
-                max_freq=None,  # None for samplerate/2
-                target_sample_rate=16000,  # target sample rate
-                use_dB_normalization=True,
-                target_dB=-20,
-                dither=1.0,  # feature dither
-                keep_transcription_text=False))
-        if config is not None:
-            config.merge_from_other_cfg(default)
-        return default
-
-    @classmethod
-    def from_config(cls, config):
-        """Build a SpeechCollator object from a config.
-
-        Args:
-            config (yacs.config.CfgNode): configs object.
-
-        Returns:
-            SpeechCollator: collator object.
-        """
-        assert 'augmentation_config' in config.collator
-        assert 'keep_transcription_text' in config.collator
-        assert 'mean_std_filepath' in config.collator
-        assert 'vocab_filepath' in config.collator
-        assert 'specgram_type' in config.collator
-        assert 'n_fft' in config.collator
-        assert config.collator
-
-        if isinstance(config.collator.augmentation_config, (str, bytes)):
-            if config.collator.augmentation_config:
-                aug_file = io.open(
-                    config.collator.augmentation_config,
-                    mode='r',
-                    encoding='utf8')
-            else:
-                aug_file = io.StringIO(initial_value='{}', newline='')
-        else:
-            aug_file = config.collator.augmentation_config
-            assert isinstance(aug_file, io.StringIO)
-
-        speech_collator = cls(
-            aug_file=aug_file,
-            random_seed=0,
-            mean_std_filepath=config.collator.mean_std_filepath,
-            unit_type=config.collator.unit_type,
-            vocab_filepath=config.collator.vocab_filepath,
-            spm_model_prefix=config.collator.spm_model_prefix,
-            specgram_type=config.collator.specgram_type,
-            feat_dim=config.collator.feat_dim,
-            delta_delta=config.collator.delta_delta,
-            stride_ms=config.collator.stride_ms,
-            window_ms=config.collator.window_ms,
-            n_fft=config.collator.n_fft,
-            max_freq=config.collator.max_freq,
-            target_sample_rate=config.collator.target_sample_rate,
-            use_dB_normalization=config.collator.use_dB_normalization,
-            target_dB=config.collator.target_dB,
-            dither=config.collator.dither,
-            keep_transcription_text=config.collator.keep_transcription_text)
-        return speech_collator
+
+def tokenids(text, keep_transcription_text):
+    # for training text is token ids
+    tokens = text  # token ids
+
+    if keep_transcription_text:
+        # text is string, convert to unicode ord
+        assert isinstance(text, str), (type(text), text)
+        tokens = [ord(t) for t in text]
+
+    tokens = np.array(tokens, dtype=np.int64)
+    return tokens
+
+
+class SpeechCollatorBase():
     def __init__(
             self,
             aug_file,
@@ -121,7 +54,7 @@ class SpeechCollator():
             spm_model_prefix,
             random_seed=0,
             unit_type="char",
-            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            spectrum_type='linear',  # 'linear', 'mfcc', 'fbank'
             feat_dim=0,  # 'mfcc', 'fbank'
             delta_delta=False,  # 'mfcc', 'fbank'
             stride_ms=10.0,  # ms
@@ -146,7 +79,7 @@ class SpeechCollator():
             n_fft (int, optional): fft points for rfft. Defaults to None.
             max_freq (int, optional): max cut freq. Defaults to None.
             target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
-            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
             feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
             delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
             use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
@@ -159,23 +92,27 @@ class SpeechCollator():
         Padding audio features with zeros to make them have the same shape (or
         a user-defined shape) within one batch.
         """
-        self._keep_transcription_text = keep_transcription_text
+        self.keep_transcription_text = keep_transcription_text
+        self.stride_ms = stride_ms
+        self.window_ms = window_ms
+        self.feat_dim = feat_dim
+
+        self.loader = LoadInputsAndTargets()

+        # only for tar filetype
         self._local_data = TarLocalData(tar2info={}, tar2object={})
-        self._augmentation_pipeline = AugmentationPipeline(
+
+        self.augmentation = AugmentationPipeline(
             augmentation_config=aug_file.read(), random_seed=random_seed)

         self._normalizer = FeatureNormalizer(
             mean_std_filepath) if mean_std_filepath else None

-        self._stride_ms = stride_ms
-        self._target_sample_rate = target_sample_rate
-
         self._speech_featurizer = SpeechFeaturizer(
             unit_type=unit_type,
             vocab_filepath=vocab_filepath,
             spm_model_prefix=spm_model_prefix,
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
@@ -187,33 +124,11 @@ class SpeechCollator():
             target_dB=target_dB,
             dither=dither)

-    def _parse_tar(self, file):
-        """Parse a tar file to get a tarfile object
-        and a map containing tarinfoes
-        """
-        result = {}
-        f = tarfile.open(file)
-        for tarinfo in f.getmembers():
-            result[tarinfo.name] = tarinfo
-        return f, result
-
-    def _subfile_from_tar(self, file):
-        """Get subfile object from tar.
-
-        It will return a subfile object from tar file
-        and cached tar file info for next reading request.
-        """
-        tarpath, filename = file.split(':', 1)[1].split('#', 1)
-        if 'tar2info' not in self._local_data.__dict__:
-            self._local_data.tar2info = {}
-        if 'tar2object' not in self._local_data.__dict__:
-            self._local_data.tar2object = {}
-        if tarpath not in self._local_data.tar2info:
-            object, infoes = self._parse_tar(tarpath)
-            self._local_data.tar2info[tarpath] = infoes
-            self._local_data.tar2object[tarpath] = object
-        return self._local_data.tar2object[tarpath].extractfile(
-            self._local_data.tar2info[tarpath][filename])
+        self.feature_size = self._speech_featurizer.audio_feature.feature_size
+        self.text_feature = self._speech_featurizer.text_feature
+        self.vocab_dict = self.text_feature.vocab_dict
+        self.vocab_list = self.text_feature.vocab_list
+        self.vocab_size = self.text_feature.vocab_size

     def process_utterance(self, audio_file, transcript):
         """Load, augment, featurize and normalize for speech data.
@@ -226,62 +141,69 @@ class SpeechCollator():
                 where transcription part could be token ids or text.
         :rtype: tuple of (2darray, list)
         """
-        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
-            speech_segment = SpeechSegment.from_file(
-                self._subfile_from_tar(audio_file), transcript)
-        else:
-            speech_segment = SpeechSegment.from_file(audio_file, transcript)
+        filetype = self.loader.file_type(audio_file)

-        # audio augment
-        self._augmentation_pipeline.transform_audio(speech_segment)
+        if filetype != 'sound':
+            spectrum = self.loader._get_from_loader(audio_file, filetype)
+            feat_dim = spectrum.shape[1]
+            assert feat_dim == self.feat_dim, f"expect feat dim {self.feat_dim}, but got {feat_dim}"
+
+            if self.keep_transcription_text:
+                transcript_part = transcript
+            else:
+                text_ids = self.text_feature.featurize(transcript)
+                transcript_part = text_ids
+        else:
+            # read audio
+            speech_segment = SpeechSegment.from_file(
+                audio_file, transcript, infos=self._local_data)
+            # audio augment
+            self.augmentation.transform_audio(speech_segment)

-        specgram, transcript_part = self._speech_featurizer.featurize(
-            speech_segment, self._keep_transcription_text)
-        if self._normalizer:
-            specgram = self._normalizer.apply(specgram)
+            # extract speech feature
+            spectrum, transcript_part = self._speech_featurizer.featurize(
+                speech_segment, self.keep_transcription_text)
+            # CMVN spectrum
+            if self._normalizer:
+                spectrum = self._normalizer.apply(spectrum)

-        # specgram augment
-        specgram = self._augmentation_pipeline.transform_feature(specgram)
-        return specgram, transcript_part
+            # spectrum augment
+            spectrum = self.augmentation.transform_feature(spectrum)
+        return spectrum, transcript_part

     def __call__(self, batch):
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : (B, Umax)
+                olens: (B,)
         """
         audios = []
         audio_lens = []
         texts = []
         text_lens = []
         utts = []
-        for utt, audio, text in batch:
+
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+
+            audio = item['feat']
+            text = item['text']
             audio, text = self.process_utterance(audio, text)
-            #utt
-            utts.append(utt)
-            # audio
+
             audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids
-            # else text is string, convert to unicode ord
-            tokens = []
-            if self._keep_transcription_text:
-                assert isinstance(text, str), (type(text), text)
-                tokens = [ord(t) for t in text]
-            else:
-                tokens = text  # token ids
-            tokens = tokens if isinstance(tokens, np.ndarray) else np.array(
-                tokens, dtype=np.int64)
+
+            tokens = tokenids(text, self.keep_transcription_text)
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
@@ -292,26 +214,161 @@ class SpeechCollator():
         olens = np.array(text_lens).astype(np.int64)
         return utts, xs_pad, ilens, ys_pad, olens

-    @property
-    def vocab_size(self):
-        return self._speech_featurizer.vocab_size
-
-    @property
-    def vocab_list(self):
-        return self._speech_featurizer.vocab_list
-
-    @property
-    def vocab_dict(self):
-        return self._speech_featurizer.vocab_dict
-
-    @property
-    def text_feature(self):
-        return self._speech_featurizer.text_feature
-
-    @property
-    def feature_size(self):
-        return self._speech_featurizer.feature_size
-
-    @property
-    def stride_ms(self):
-        return self._speech_featurizer.stride_ms
+
+class SpeechCollator(SpeechCollatorBase):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        default = CfgNode(
+            dict(
+                augmentation_config="",
+                random_seed=0,
+                mean_std_filepath="",
+                unit_type="char",
+                vocab_filepath="",
+                spm_model_prefix="",
+                spectrum_type='linear',  # 'linear', 'mfcc', 'fbank'
+                feat_dim=0,  # 'mfcc', 'fbank'
+                delta_delta=False,  # 'mfcc', 'fbank'
+                stride_ms=10.0,  # ms
+                window_ms=20.0,  # ms
+                n_fft=None,  # fft points
+                max_freq=None,  # None for samplerate/2
+                target_sample_rate=16000,  # target sample rate
+                use_dB_normalization=True,
+                target_dB=-20,
+                dither=1.0,  # feature dither
+                keep_transcription_text=False))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
+    @classmethod
+    def from_config(cls, config):
+        """Build a SpeechCollator object from a config.
+
+        Args:
+            config (yacs.config.CfgNode): configs object.
+
+        Returns:
+            SpeechCollator: collator object.
+        """
+        assert 'augmentation_config' in config.collator
+        assert 'keep_transcription_text' in config.collator
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
+        assert 'spectrum_type' in config.collator
+        assert 'n_fft' in config.collator
+        assert config.collator
+
+        if isinstance(config.collator.augmentation_config, (str, bytes)):
+            if config.collator.augmentation_config:
+                aug_file = io.open(
+                    config.collator.augmentation_config,
+                    mode='r',
+                    encoding='utf8')
+            else:
+                aug_file = io.StringIO(initial_value='{}', newline='')
+        else:
+            aug_file = config.collator.augmentation_config
+            assert isinstance(aug_file, io.StringIO)
+
+        speech_collator = cls(
+            aug_file=aug_file,
+            random_seed=0,
+            mean_std_filepath=config.collator.mean_std_filepath,
+            unit_type=config.collator.unit_type,
+            vocab_filepath=config.collator.vocab_filepath,
+            spm_model_prefix=config.collator.spm_model_prefix,
+            spectrum_type=config.collator.spectrum_type,
+            feat_dim=config.collator.feat_dim,
+            delta_delta=config.collator.delta_delta,
+            stride_ms=config.collator.stride_ms,
+            window_ms=config.collator.window_ms,
+            n_fft=config.collator.n_fft,
+            max_freq=config.collator.max_freq,
+            target_sample_rate=config.collator.target_sample_rate,
+            use_dB_normalization=config.collator.use_dB_normalization,
+            target_dB=config.collator.target_dB,
+            dither=config.collator.dither,
+            keep_transcription_text=config.collator.keep_transcription_text)
+        return speech_collator
+
+
+class TripletSpeechCollator(SpeechCollator):
+    def process_utterance(self, audio_file, translation, transcript):
+        """Load, augment, featurize and normalize for speech data.
+
+        :param audio_file: Filepath or file object of audio file.
+        :type audio_file: str | file
+        :param translation: translation text.
+        :type translation: str
+        :return: Tuple of audio feature tensor and data of translation part,
+                 where translation part could be token ids or text.
+        :rtype: tuple of (2darray, list)
+        """
+        spectrum, translation_part = super().process_utterance(audio_file,
+                                                               translation)
+        transcript_part = self._speech_featurizer.text_featurize(
+            transcript, self.keep_transcription_text)
+        return spectrum, translation_part, transcript_part
+
+    def __call__(self, batch):
+        """batch examples
+
+        Args:
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
+                audio (np.ndarray) shape (T, D)
+                text (List[int] or str): shape (U,)
+
+        Returns:
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : [(B, Umax), (B, Umax)]
+                olens: [(B,), (B,)]
+        """
+        utts = []
+        audios = []
+        audio_lens = []
+        translation_text = []
+        translation_text_lens = []
+        transcription_text = []
+        transcription_text_lens = []
+
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+
+            audio = item['feat']
+            translation = item['text']
+            transcription = item['text1']
+            audio, translation, transcription = self.process_utterance(
+                audio, translation, transcription)
+
+            audios.append(audio)  # [T, D]
+            audio_lens.append(audio.shape[0])
+
+            tokens = [[], []]
+            for idx, text in enumerate([translation, transcription]):
+                tokens[idx] = tokenids(text, self.keep_transcription_text)
+
+            translation_text.append(tokens[0])
+            translation_text_lens.append(tokens[0].shape[0])
+            transcription_text.append(tokens[1])
+            transcription_text_lens.append(tokens[1].shape[0])
+
+        xs_pad = pad_list(audios, 0.0).astype(np.float32)  #[B, T, D]
+        ilens = np.array(audio_lens).astype(np.int64)
+
+        padded_translation = pad_list(translation_text,
+                                      IGNORE_ID).astype(np.int64)
+        translation_lens = np.array(translation_text_lens).astype(np.int64)
+
+        padded_transcription = pad_list(transcription_text,
+                                        IGNORE_ID).astype(np.int64)
+        transcription_lens = np.array(transcription_text_lens).astype(np.int64)
+
+        ys_pad = (padded_translation, padded_transcription)
+        olens = (translation_lens, transcription_lens)
+        return utts, xs_pad, ilens, ys_pad, olens
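The new tokenids helper centralizes the text branch both collators previously inlined: with keep_transcription_text=True a raw string becomes its unicode code points, otherwise token ids pass through; either way an int64 array comes back. A standalone check (numpy only):

import numpy as np

def tokenids(text, keep_transcription_text):
    # for training, text is already token ids
    tokens = text
    if keep_transcription_text:
        # at test time text is a raw string; map to unicode code points
        assert isinstance(text, str), (type(text), text)
        tokens = [ord(t) for t in text]
    return np.array(tokens, dtype=np.int64)

print(tokenids("hi", keep_transcription_text=True))        # [104 105]
print(tokenids([7, 8, 9], keep_transcription_text=False))  # [7 8 9]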

@ -1,631 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
from collections import namedtuple
from typing import Optional
import kaldiio
import numpy as np
from yacs.config import CfgNode
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.utils.log import Log
__all__ = ["SpeechCollator", "KaldiPrePorocessedCollator"]
logger = Log(__name__).getlog()
# namedtupe need global for pickle.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
class SpeechCollator():
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
augmentation_config="",
random_seed=0,
mean_std_filepath="",
unit_type="char",
vocab_filepath="",
spm_model_prefix="",
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0, # feature dither
keep_transcription_text=False))
if config is not None:
config.merge_from_other_cfg(default)
return default
@classmethod
def from_config(cls, config):
"""Build a SpeechCollator object from a config.
Args:
config (yacs.config.CfgNode): configs object.
Returns:
SpeechCollator: collator object.
"""
assert 'augmentation_config' in config.collator
assert 'keep_transcription_text' in config.collator
assert 'mean_std_filepath' in config.collator
assert 'vocab_filepath' in config.collator
assert 'specgram_type' in config.collator
assert 'n_fft' in config.collator
assert config.collator
if isinstance(config.collator.augmentation_config, (str, bytes)):
if config.collator.augmentation_config:
aug_file = io.open(
config.collator.augmentation_config,
mode='r',
encoding='utf8')
else:
aug_file = io.StringIO(initial_value='{}', newline='')
else:
aug_file = config.collator.augmentation_config
assert isinstance(aug_file, io.StringIO)
speech_collator = cls(
aug_file=aug_file,
random_seed=0,
mean_std_filepath=config.collator.mean_std_filepath,
unit_type=config.collator.unit_type,
vocab_filepath=config.collator.vocab_filepath,
spm_model_prefix=config.collator.spm_model_prefix,
specgram_type=config.collator.specgram_type,
feat_dim=config.collator.feat_dim,
delta_delta=config.collator.delta_delta,
stride_ms=config.collator.stride_ms,
window_ms=config.collator.window_ms,
n_fft=config.collator.n_fft,
max_freq=config.collator.max_freq,
target_sample_rate=config.collator.target_sample_rate,
use_dB_normalization=config.collator.use_dB_normalization,
target_dB=config.collator.target_dB,
dither=config.collator.dither,
keep_transcription_text=config.collator.keep_transcription_text)
return speech_collator
def __init__(
self,
aug_file,
mean_std_filepath,
vocab_filepath,
spm_model_prefix,
random_seed=0,
unit_type="char",
specgram_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank'
stride_ms=10.0, # ms
window_ms=20.0, # ms
n_fft=None, # fft points
max_freq=None, # None for samplerate/2
target_sample_rate=16000, # target sample rate
use_dB_normalization=True,
target_dB=-20,
dither=1.0,
keep_transcription_text=True):
"""SpeechCollator Collator
Args:
unit_type(str): token unit type, e.g. char, word, spm
vocab_filepath (str): vocab file path.
mean_std_filepath (str): mean and std file path, which suffix is *.npy
spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
stride_ms (float, optional): stride size in ms. Defaults to 10.0.
window_ms (float, optional): window size in ms. Defaults to 20.0.
n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
target_dB (int, optional): target dB. Defaults to -20.
random_seed (int, optional): for random generator. Defaults to 0.
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
if ``keep_transcription_text`` is False, text is token ids else is raw string.
Do augmentations
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch.
"""
self._keep_transcription_text = keep_transcription_text
self._local_data = TarLocalData(tar2info={}, tar2object={})
self._augmentation_pipeline = AugmentationPipeline(
augmentation_config=aug_file.read(), random_seed=random_seed)
self._normalizer = FeatureNormalizer(
mean_std_filepath) if mean_std_filepath else None
self._stride_ms = stride_ms
self._target_sample_rate = target_sample_rate
self._speech_featurizer = SpeechFeaturizer(
unit_type=unit_type,
vocab_filepath=vocab_filepath,
spm_model_prefix=spm_model_prefix,
specgram_type=specgram_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=stride_ms,
window_ms=window_ms,
n_fft=n_fft,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=dither)
def _parse_tar(self, file):
"""Parse a tar file to get a tarfile object
and a map containing tarinfoes
"""
result = {}
f = tarfile.open(file)
for tarinfo in f.getmembers():
result[tarinfo.name] = tarinfo
return f, result
def _subfile_from_tar(self, file):
"""Get subfile object from tar.
It will return a subfile object from tar file
and cached tar file info for next reading request.
"""
tarpath, filename = file.split(':', 1)[1].split('#', 1)
if 'tar2info' not in self._local_data.__dict__:
self._local_data.tar2info = {}
if 'tar2object' not in self._local_data.__dict__:
self._local_data.tar2object = {}
if tarpath not in self._local_data.tar2info:
object, infoes = self._parse_tar(tarpath)
self._local_data.tar2info[tarpath] = infoes
self._local_data.tar2object[tarpath] = object
return self._local_data.tar2object[tarpath].extractfile(
self._local_data.tar2info[tarpath][filename])
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
return self._speech_featurizer.vocab_list
@property
def vocab_dict(self):
return self._speech_featurizer.vocab_dict
@property
def text_feature(self):
return self._speech_featurizer.text_feature
@property
def feature_size(self):
return self._speech_featurizer.feature_size
@property
def stride_ms(self):
return self._speech_featurizer.stride_ms
def process_utterance(self, audio_file, translation):
"""Load, augment, featurize and normalize for speech data.
:param audio_file: Filepath or file object of audio file.
:type audio_file: str | file
:param translation: translation text.
:type translation: str
:return: Tuple of audio feature tensor and data of translation part,
where translation part could be token ids or text.
:rtype: tuple of (2darray, list)
"""
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
speech_segment = SpeechSegment.from_file(
self._subfile_from_tar(audio_file), translation)
else:
speech_segment = SpeechSegment.from_file(audio_file, translation)
# audio augment
self._augmentation_pipeline.transform_audio(speech_segment)
specgram, translation_part = self._speech_featurizer.featurize(
speech_segment, self._keep_transcription_text)
if self._normalizer:
specgram = self._normalizer.apply(specgram)
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
return specgram, translation_part
def __call__(self, batch):
"""batch examples
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
audio : (B, Tmax, D)
audio_lens: (B)
text : (B, Umax)
text_lens: (B)
"""
audios = []
audio_lens = []
texts = []
text_lens = []
utts = []
for utt, audio, text in batch:
audio, text = self.process_utterance(audio, text)
#utt
utts.append(utt)
# audio
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
# text
# for training, text is token ids
# else text is string, convert to unicode ord
tokens = []
if self._keep_transcription_text:
assert isinstance(text, str), (type(text), text)
tokens = [ord(t) for t in text]
else:
tokens = text # token ids
tokens = tokens if isinstance(tokens, np.ndarray) else np.array(
tokens, dtype=np.int64)
texts.append(tokens)
text_lens.append(tokens.shape[0])
padded_audios = pad_sequence(
audios, padding_value=0.0).astype(np.float32) #[B, T, D]
audio_lens = np.array(audio_lens).astype(np.int64)
padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, padded_texts, text_lens
class TripletSpeechCollator(SpeechCollator):
def process_utterance(self, audio_file, translation, transcript):
"""Load, augment, featurize and normalize for speech data.
:param audio_file: Filepath or file object of audio file.
:type audio_file: str | file
:param translation: translation text.
:type translation: str
:return: Tuple of audio feature tensor and data of translation part,
where translation part could be token ids or text.
:rtype: tuple of (2darray, list)
"""
if isinstance(audio_file, str) and audio_file.startswith('tar:'):
speech_segment = SpeechSegment.from_file(
self._subfile_from_tar(audio_file), translation)
else:
speech_segment = SpeechSegment.from_file(audio_file, translation)
# audio augment
self._augmentation_pipeline.transform_audio(speech_segment)
specgram, translation_part = self._speech_featurizer.featurize(
speech_segment, self._keep_transcription_text)
transcript_part = self._speech_featurizer._text_featurizer.featurize(
transcript)
if self._normalizer:
specgram = self._normalizer.apply(specgram)
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
return specgram, translation_part, transcript_part

    def __call__(self, batch):
        """Batch examples.

        Args:
            batch (List[Tuple]): each item is (utt, audio, translation,
                transcription), where audio (np.ndarray) has shape (T, D) and
                the two text fields (List[int] or str) have shapes (U,), (V,).
        Returns:
            tuple: batched data as
                audio : (B, Tmax, D)
                audio_lens: (B,)
                translation : (B, Umax), translation_lens: (B,)
                transcription : (B, Vmax), transcription_lens: (B,)
        """
        audios = []
        audio_lens = []
        translation_text = []
        translation_text_lens = []
        transcription_text = []
        transcription_text_lens = []
        utts = []
        for utt, audio, translation, transcription in batch:
            audio, translation, transcription = self.process_utterance(
                audio, translation, transcription)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text: token ids for training; otherwise raw strings,
            # which are converted to unicode code points
            tokens = [[], []]
            for idx, text in enumerate([translation, transcription]):
                if self._keep_transcription_text:
                    assert isinstance(text, str), (type(text), text)
                    tokens[idx] = [ord(t) for t in text]
                else:
                    tokens[idx] = text  # token ids
                tokens[idx] = tokens[idx] if isinstance(
                    tokens[idx], np.ndarray) else np.array(
                        tokens[idx], dtype=np.int64)
            translation_text.append(tokens[0])
            translation_text_lens.append(tokens[0].shape[0])
            transcription_text.append(tokens[1])
            transcription_text_lens.append(tokens[1].shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_translation = pad_sequence(
            translation_text, padding_value=IGNORE_ID).astype(np.int64)
        translation_lens = np.array(translation_text_lens).astype(np.int64)
        padded_transcription = pad_sequence(
            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
        transcription_lens = np.array(transcription_text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, (
            padded_translation, padded_transcription), (translation_lens,
                                                        transcription_lens)


class KaldiPrePorocessedCollator(SpeechCollator):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                augmentation_config="",
                random_seed=0,
                unit_type="char",
                vocab_filepath="",
                spm_model_prefix="",
                feat_dim=0,
                stride_ms=10.0,
                keep_transcription_text=False))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a SpeechCollator object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.
        Returns:
            SpeechCollator: collator object.
        """
        assert 'augmentation_config' in config.collator
        assert 'keep_transcription_text' in config.collator
        assert 'vocab_filepath' in config.collator
        assert config.collator

        if isinstance(config.collator.augmentation_config, (str, bytes)):
            if config.collator.augmentation_config:
                aug_file = io.open(
                    config.collator.augmentation_config,
                    mode='r',
                    encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.collator.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        speech_collator = cls(
            aug_file=aug_file,
            random_seed=0,
            unit_type=config.collator.unit_type,
            vocab_filepath=config.collator.vocab_filepath,
            spm_model_prefix=config.collator.spm_model_prefix,
            feat_dim=config.collator.feat_dim,
            stride_ms=config.collator.stride_ms,
            keep_transcription_text=config.collator.keep_transcription_text)
        return speech_collator

    def __init__(self,
                 aug_file,
                 vocab_filepath,
                 spm_model_prefix,
                 random_seed=0,
                 unit_type="char",
                 feat_dim=0,
                 stride_ms=10.0,
                 keep_transcription_text=True):
        """Collator for pre-processed kaldi features.

        Applies augmentation, then pads audio features with zeros so that they
        have the same shape (or a user-defined shape) within one batch.

        Args:
            aug_file (io.StringIO): augmentation configuration, as a JSON string.
            vocab_filepath (str): vocab file path.
            spm_model_prefix (str): spm model prefix; needed if `unit_type` is spm.
            random_seed (int, optional): seed for the random generator. Defaults to 0.
            unit_type (str): token unit type, e.g. char, word, spm.
            feat_dim (int): feature dimension of the pre-computed features.
            stride_ms (float): frame stride in milliseconds.
            keep_transcription_text (bool, optional): if True (e.g. when not in
                training mode), text is kept as a raw string rather than being
                tokenized into ids. Defaults to True.
        """
        self._keep_transcription_text = keep_transcription_text
        self._feat_dim = feat_dim
        self._stride_ms = stride_ms

        self._local_data = TarLocalData(tar2info={}, tar2object={})
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), random_seed=random_seed)

        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
                                               spm_model_prefix)

    def process_utterance(self, audio_file, translation):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of kaldi processed feature.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :return: Tuple of audio feature tensor and data of translation part,
                 where translation part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        specgram = kaldiio.load_mat(audio_file)
        assert specgram.shape[
            1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
                self._feat_dim, specgram.shape[1])

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)

        if self._keep_transcription_text:
            return specgram, translation
        else:
            text_ids = self._text_featurizer.featurize(translation)
            return specgram, text_ids


class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
    def process_utterance(self, audio_file, translation, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of kaldi processed feature.
        :type audio_file: str | file
        :param translation: Translation text.
        :type translation: str
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of translation and
                 transcription parts, where translation and transcription
                 parts could be token ids or text.
        :rtype: tuple of (2darray, (list, list))
        """
        specgram = kaldiio.load_mat(audio_file)
        assert specgram.shape[
            1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
                self._feat_dim, specgram.shape[1])

        # specgram augment
        specgram = self._augmentation_pipeline.transform_feature(specgram)

        if self._keep_transcription_text:
            return specgram, translation, transcript
        else:
            translation_text_ids = self._text_featurizer.featurize(translation)
            transcript_text_ids = self._text_featurizer.featurize(transcript)
            return specgram, translation_text_ids, transcript_text_ids

    def __call__(self, batch):
        """Batch examples.

        Args:
            batch (List[Tuple]): each item is (utt, audio, translation,
                transcription), where audio (np.ndarray) has shape (T, D),
                translation (List[int] or str) has shape (U,) and
                transcription (List[int] or str) has shape (V,).
        Returns:
            tuple: batched data as
                audio : (B, Tmax, D)
                audio_lens: (B,)
                translation_text : (B, Umax), translation_text_lens: (B,)
                transcription_text : (B, Vmax), transcription_text_lens: (B,)
        """
        audios = []
        audio_lens = []
        translation_text = []
        translation_text_lens = []
        transcription_text = []
        transcription_text_lens = []
        utts = []
        for utt, audio, translation, transcription in batch:
            audio, translation, transcription = self.process_utterance(
                audio, translation, transcription)
            # utt
            utts.append(utt)
            # audio
            audios.append(audio)  # [T, D]
            audio_lens.append(audio.shape[0])
            # text: token ids for training; otherwise raw strings,
            # which are converted to unicode code points
            tokens = [[], []]
            for idx, text in enumerate([translation, transcription]):
                if self._keep_transcription_text:
                    assert isinstance(text, str), (type(text), text)
                    tokens[idx] = [ord(t) for t in text]
                else:
                    tokens[idx] = text  # token ids
                tokens[idx] = tokens[idx] if isinstance(
                    tokens[idx], np.ndarray) else np.array(
                        tokens[idx], dtype=np.int64)
            translation_text.append(tokens[0])
            translation_text_lens.append(tokens[0].shape[0])
            transcription_text.append(tokens[1])
            transcription_text_lens.append(tokens[1].shape[0])

        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  # [B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_translation = pad_sequence(
            translation_text, padding_value=IGNORE_ID).astype(np.int64)
        translation_lens = np.array(translation_text_lens).astype(np.int64)
        padded_transcription = pad_sequence(
            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
        transcription_lens = np.array(transcription_text_lens).astype(np.int64)
        return utts, padded_audios, audio_lens, (
            padded_translation, padded_transcription), (translation_lens,
                                                        transcription_lens)
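
The batching above leans on `pad_sequence`; here is a minimal numpy sketch of the behavior assumed throughout these collators (an illustration, not the module's actual helper):

import numpy as np

def pad_sequence_sketch(arrays, padding_value=0.0):
    # Stack variable-length [T, ...] arrays into one [B, Tmax, ...] array,
    # filling the tail of shorter items with `padding_value`.
    max_len = max(a.shape[0] for a in arrays)
    out = np.full((len(arrays), max_len) + arrays[0].shape[1:],
                  padding_value, dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        out[i, :a.shape[0]] = a
    return out

# e.g. two utterances of 3 and 2 frames, 4 feature dims each
batch = [np.ones((3, 4)), np.ones((2, 4))]
assert pad_sequence_sketch(batch).shape == (2, 3, 4)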

@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+__all__ = ["ManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
         return len(self._manifest)

     def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"]
-
-
-class TripletManifestDataset(ManifestDataset):
-    """
-    For Joint Training of Speech Translation and ASR.
-    text: translation,
-    text1: transcript.
-    """
-
-    def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"], instance[
-            "text1"]
+        return self._manifest[idx]


 class TransformDataset(Dataset):
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
         return len(self.minibatch)

     def __getitem__(self, idx):
-        instance = self.minibatch[idx]
-        return instance["utt"], instance["feat"], instance["text"]
+        return self.minibatch[idx]
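
After this change `__getitem__` hands back the manifest entry itself and the collator unpacks the fields it needs. A hypothetical manifest instance (keys taken from the code above, values invented):

instance = {
    "utt": "utt-0001",                  # utterance id (hypothetical)
    "feat": "data/wavs/0001.wav",       # audio path, or a kaldi ark entry
    "text": "translation or transcript",
    "text1": "source transcript",       # extra field used by triplet data
}
utt, audio, text = instance["utt"], instance["feat"], instance["text"]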

@@ -321,6 +321,22 @@ class LoadInputsAndTargets():
             raise NotImplementedError(
                 "Not supported: loader_type={}".format(filetype))

+    def file_type(self, filepath):
+        suffix = filepath.split(":")[0].split('.')[-1]
+        if suffix == 'ark':
+            return 'mat'
+        elif suffix == 'scp':
+            return 'scp'
+        elif suffix == 'npy':
+            return 'npy'
+        elif suffix == 'npz':
+            return 'npz'
+        elif suffix in ['wav', 'flac']:
+            # PCM16
+            return 'sound'
+        else:
+            raise ValueError(f"Unsupported filetype: {suffix}")
+

 class SoundHDF5File():
     """Collecting sound files to a HDF5 file

@@ -49,7 +49,7 @@ class CTCDecoder(nn.Layer):
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
             reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
             batch_average (bool): do batch dim wise average.
-            grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
+            grad_norm_type (str): one of 'instance', 'batch', 'frame', None.
         """
         assert check_argument_types()
         super().__init__()

@@ -49,6 +49,8 @@ class CTCLoss(nn.Layer):
             self.norm_by_batchsize = True
         elif grad_norm_type == 'frame':
             self.norm_by_total_logits_len = True
+        else:
+            raise ValueError(f"CTCLoss does not support grad norm type: {grad_norm_type}")

     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.

@@ -263,6 +263,7 @@ class Trainer():
                     msg += f"{v:>.8f}" if isinstance(v,
                                                      float) else f"{v}"
                     msg += ","
+                msg = msg[:-1]  # remove the last ","
                 logger.info(msg)
                 data_start_time = time.time()
             except Exception as e:
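
The trailing-comma trim added above can equivalently be written with a join, avoiding the slice (equivalent sketch; `metrics` is a stand-in for the logged values):

metrics = {"loss": 3.14159265, "lr": 1e-3, "step": 42}
msg = ", ".join(
    f"{k}: {v:>.8f}" if isinstance(v, float) else f"{k}: {v}"
    for k, v in metrics.items())
# -> "loss: 3.14159265, lr: 0.00100000, step: 42"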

@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
 ```bash
 python3 utils/compute_mean_std.py \
 --num_samples 2000 \
---specgram_type linear \
+--spectrum_type linear \
 --manifest_path examples/librispeech/data/manifest.train \
 --output_path examples/librispeech/data/mean_std.npz
 ```

@@ -44,7 +44,7 @@ For CMVN, a subset of (or the full) training set is chosen to compute the statistics
 cd examples/aishell/s0
 python3 ../../../utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
---specgram_type="linear" \
+--spectrum_type="linear" \
 --delta_delta=false \
 --stride_ms=10.0 \
 --window_ms=20.0 \

# Released Models

## Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | - | 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz) | Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers | 0.065 | - | 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | - | 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | - | 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | - | 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | - | 0.0544 | 960 h

## Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | - | 151 h
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | - | 0.0685 | 960 h
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers | - | 0.0541 | 8628 h

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear #linear, mfcc, fbank
+  spectrum_type: linear #linear, mfcc, fbank
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
     --window_ms=20.0 \

@ -0,0 +1,36 @@
#!/bin/bash
if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
model_type=$3
audio_file=$4
# download language model
bash local/download_lm_ch.sh
if [ $? -ne 0 ]; then
exit 1
fi
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
exit 0

@@ -15,6 +15,8 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"

+audio_file="data/tmp.wav"
+
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
     bash ./local/data.sh || exit -1
@@ -44,3 +46,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test export ckpt avg_n
     CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} || exit -1
 fi
+
+# Optionally, you can add LM and test it with runtime.
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # test a single .wav file
+    CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+fi

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \

@@ -38,7 +38,8 @@ for type in attention ctc_greedy_search; do
         --config ${config_path} \
         --result_file ${output_dir}/${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
@@ -56,7 +57,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
         --config ${config_path} \
         --result_file ${output_dir}/${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \

@@ -32,7 +32,8 @@ for type in attention ctc_greedy_search; do
         --config ${config_path} \
         --result_file ${output_dir}/${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
@@ -50,7 +51,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
         --config ${config_path} \
         --result_file ${output_dir}/${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None

@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -17,7 +17,7 @@ collator:
   augmentation_config: ""
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -35,7 +35,8 @@ for type in attention ctc_greedy_search; do
         --config ${config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
@@ -51,7 +52,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
         --config ${config_path} \
         --result_file ${ckpt_prefix}.${type}.rsl \
         --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}

     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"

@@ -30,12 +30,12 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then

@ -0,0 +1,26 @@
coverage
gpustat
jsonlines
kaldiio
llvmlite==0.31.0
loguru
numba==0.47.0
numpy==1.18.5
Pillow
pre-commit
pybind11
python-speech-features
resampy==0.2.2
sacrebleu
scipy==1.2.1
sentencepiece
snakeviz
SoundFile==0.9.0.post1
sox
soxbindings
tensorboardX
textgrid
tqdm
typeguard
visualdl==2.2.0
yacs

@ -0,0 +1,66 @@
#! /usr/bin/env bash
cd .. >> /dev/null
source utils/log.sh
SUDO='sudo'
if [ $(id -u) -eq 0 ]; then
SUDO=''
fi
if [ -e /etc/lsb-release ];then
${SUDO} apt-get update -y
${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
if [ $? != 0 ]; then
error_msg "Please using Ubuntu or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev by user."
exit -1
fi
fi
source tools/venv/bin/activate
cd -
#install python dependencies
if [ -f "requirements.txt" ]; then
pip3 install -r requirements.txt
fi
if [ $? != 0 ]; then
error_msg "Install python dependencies failed !!!"
exit 1
fi
cd .. >> /dev/null
# install package libsndfile
python3 -c "import soundfile"
if [ $? != 0 ]; then
info_msg "Install package libsndfile into default system path."
wget "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
if [ $? != 0 ]; then
error_msg "Download libsndfile-1.0.28.tar.gz failed !!!"
exit 1
fi
tar -zxvf libsndfile-1.0.28.tar.gz
cd libsndfile-1.0.28
./configure > /dev/null && make > /dev/null && make install > /dev/null
cd ..
rm -rf libsndfile-1.0.28
rm libsndfile-1.0.28.tar.gz
fi
# install decoders
python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
if [ $? != 0 ]; then
cd deepspeech/decoders/swig > /dev/null
sh setup.sh
cd - > /dev/null
fi
python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
if [ $? != 0 ]; then
error_msg "Please check why decoder install error!"
exit -1
fi
info_msg "Install all dependencies successfully."

@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples', int, 2000, "# of samples used for statistics.")
-add_arg('specgram_type', str,
+add_arg('spectrum_type', str,
         'linear',
         "Audio feature type. Options: linear, mfcc, fbank.",
         choices=['linear', 'mfcc', 'fbank'])
@@ -58,7 +58,7 @@ def main():
     augmentation_pipeline = AugmentationPipeline('{}')
     audio_featurizer = AudioFeaturizer(
-        specgram_type=args.specgram_type,
+        spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
         stride_ms=args.stride_ms,
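
The rename is purely mechanical: the CLI flag and the featurizer keyword move from `specgram_type` to `spectrum_type` together. A hedged construction sketch using only the parameters visible in this hunk (import path and full signature assumed):

from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

audio_featurizer = AudioFeaturizer(
    spectrum_type="fbank",  # was `specgram_type` before this commit
    feat_dim=80,
    delta_delta=False,
    stride_ms=10.0)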

@@ -26,7 +26,7 @@ from deepspeech.utils.utility import print_arguments
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
+add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp")
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.json',
         "Filepath of cmvn.")
@@ -76,6 +76,7 @@ def main():
     assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
     if args.feat_type == 'raw':
         feat_shape.append(feat_dim)
+        line_json['filetype'] = 'sound'
     else:  # kaldi
         raise NotImplementedError('no support kaldi feat now!')
     fout.write(json.dumps(line_json) + '\n')
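
With the added line, every raw-audio manifest entry now carries an explicit `filetype` that matches the `file_type` suffix mapping earlier in this commit. A hypothetical resulting manifest line (ids and paths invented):

import json

line_json = {"utt": "utt-0001",
             "feat": "data/wavs/0001.wav",
             "feat_shape": [1234, 80],
             "text": "hello world",
             "filetype": "sound"}  # added for raw wav/flac input
print(json.dumps(line_json))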
