diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index f5a514c7..9a34cbdc 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -31,7 +31,6 @@ from yacs.config import CfgNode
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.collator import TripletSpeechCollator
 from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.dataset import TripletManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
         config.collator.keep_transcription_text = False

         # train/valid dataset, return token ids
-        Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
         config.data.manifest = config.data.train_manifest
-        train_dataset = Dataset.from_config(config)
+        train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        dev_dataset = Dataset.from_config(config)
+        dev_dataset = ManifestDataset.from_config(config)

         if config.model.model_conf.asr_weight > 0.:
             Collator = TripletSpeechCollator
diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py
index 4c40c847..6f3b646c 100644
--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -24,15 +24,15 @@ class AudioFeaturizer():
     Currently, it supports feature types of linear spectrogram and mfcc.

-    :param specgram_type: Specgram feature type. Options: 'linear'.
-    :type specgram_type: str
+    :param spectrum_type: Specgram feature type. Options: 'linear'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
                      corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_feq is the
+                     returned; when spectrum_type is 'mfcc', max_freq is the
                      highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Audio are resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer():
     """

     def __init__(self,
-                 specgram_type: str='linear',
+                 spectrum_type: str='linear',
                  feat_dim: int=None,
                  delta_delta: bool=False,
                  stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer():
                  use_dB_normalization=True,
                  target_dB=-20,
                  dither=1.0):
-        self._specgram_type = specgram_type
+        self._spectrum_type = spectrum_type
         # mfcc and fbank using `feat_dim`
         self._feat_dim = feat_dim
         # mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer():
     def feature_size(self):
         """audio feature size"""
         feat_dim = 0
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             fft_point = self._window_ms if self._fft_point is None else self._fft_point
             feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             # mfcc, delta, delta-delta
             feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             # fbank, delta, delta-delta
             feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear, mfcc, fbank." % self._spectrum_type)
         return feat_dim

     def _compute_specgram(self, audio_segment):
         """Extract various audio features."""
         sample_rate = audio_segment.sample_rate
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             samples = audio_segment.samples
             return self._compute_linear_specgram(
                 samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer():
                 stride_ms=self._stride_ms,
                 window_ms=self._window_ms,
                 max_freq=self._max_freq)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             samples = audio_segment.to('int16')
             return self._compute_mfcc(
                 samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer():
                 max_freq=self._max_freq,
                 dither=self._dither,
                 delta_delta=self._delta_delta)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             samples = audio_segment.to('int16')
             return self._compute_fbank(
                 samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer():
                 dither=self._dither,
                 delta_delta=self._delta_delta)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear, mfcc, fbank." % self._spectrum_type)

     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""
diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py
index f9f7d7c2..7471d164 100644
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer


 class SpeechFeaturizer():
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type specgram_type: str
-    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type specgram_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
-                     corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_freq is the
-                     highest band edge of mel filters.
-    :types max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
+    """Speech and Text feature extraction.
     """

     def __init__(self,
                  unit_type,
                  vocab_filepath,
                  spm_model_prefix=None,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  stride_ms=10.0,
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
         self.window_ms = window_ms

         self.audio_feature = AudioFeaturizer(
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py
index 2a581232..f5fc3097 100644
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -15,6 +15,7 @@
 import json
 import math
 import tarfile
+from collections import namedtuple
 from typing import List
 from typing import Optional
 from typing import Text
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index c5c0a414..553ffcb5 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
 logger = Log(__name__).getlog()


+def tokenids(text, keep_transcription_text):
+    # for training text is token ids
+    tokens = text # token ids
+
+    if keep_transcription_text:
+        # text is string, convert to unicode ord
+        assert isinstance(text, str), (type(text), text)
+        tokens = [ord(t) for t in text]
+
+    tokens = np.array(tokens, dtype=np.int64)
+    return tokens
+
+
 class SpeechCollatorBase():
     def __init__(
             self,
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
         # extract speech feature
         spectrum, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self.keep_transcription_text)
-
         # CMVN spectrum
         if self._normalizer:
             spectrum = self._normalizer.apply(spectrum)
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : (B, Umax)
+                olens: (B,)
         """
         audios = []
         audio_lens = []
         texts = []
         text_lens = []
         utts = []
-        for utt, audio, text in batch:
+
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+
+            audio = item['feat']
+            text = item['text']
             audio, text = self.process_utterance(audio, text)
-            #utt
-            utts.append(utt)
-            # audio
+
             audios.append(audio) # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids, else text is string, convert to unicode ord
-            tokens = []
-            if self.keep_transcription_text:
-                assert isinstance(text, str), (type(text), text)
-                tokens = [ord(t) for t in text]
-            else:
-                tokens = text # token ids
-            tokens = np.array(tokens, dtype=np.int64)
+
+            tokens = tokenids(text, self.keep_transcription_text)
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : [(B, Umax), (B, Umax)]
+                olens: [(B,), (B,)]
         """
+        utts = []
         audios = []
         audio_lens = []
         translation_text = []
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
         translation_text_lens = []
         transcription_text = []
         transcription_text_lens = []
-        utts = []
-        for utt, audio, translation, transcription in batch:
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+
+            audio = item['feat']
+            translation = item['text']
+            transcription = item['text1']
             audio, translation, transcription = self.process_utterance(
                 audio, translation, transcription)
-            #utt
-            utts.append(utt)
-            # audio
+
             audios.append(audio) # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids
-            # else text is string, convert to unicode ord
+
             tokens = [[], []]
             for idx, text in enumerate([translation, transcription]):
-                if self.keep_transcription_text:
-                    assert isinstance(text, str), (type(text), text)
-                    tokens[idx] = [ord(t) for t in text]
-                else:
-                    tokens[idx] = text # token ids
-                tokens[idx] = np.array(tokens[idx], dtype=np.int64)
+                tokens[idx] = tokenids(text, self.keep_transcription_text)
             translation_text.append(tokens[0])
             translation_text_lens.append(tokens[0].shape[0])
             transcription_text.append(tokens[1])
             transcription_text_lens.append(tokens[1].shape[0])
-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32) #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_translation = pad_sequence(
-            translation_text, padding_value=IGNORE_ID).astype(np.int64)
+        xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D]
+        ilens = np.array(audio_lens).astype(np.int64)
+
+        padded_translation = pad_list(translation_text,
+                                      IGNORE_ID).astype(np.int64)
         translation_lens = np.array(translation_text_lens).astype(np.int64)
-        padded_transcription = pad_sequence(
-            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
+
+        padded_transcription = pad_list(transcription_text,
+                                        IGNORE_ID).astype(np.int64)
         transcription_lens = np.array(transcription_text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, (
-            padded_translation, padded_transcription), (translation_lens,
-                                                        transcription_lens)
+
+        ys_pad = (padded_translation, padded_transcription)
+        olens = (translation_lens, transcription_lens)
+        return utts, xs_pad, ilens, ys_pad, olens
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 56e53475..1945c5f7 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+__all__ = ["ManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
         return len(self._manifest)

     def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"]
-
-
-class TripletManifestDataset(ManifestDataset):
-    """
-    For Joint Training of Speech Translation and ASR.
-    text: translation,
-    text1: transcript.
-    """
-
-    def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"], instance[
-            "text1"]
+        return self._manifest[idx]


 class TransformDataset(Dataset):
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
         return len(self.minibatch)

     def __getitem__(self, idx):
-        instance = self.minibatch[idx]
-        return instance["utt"], instance["feat"], instance["text"]
+        return self.minibatch[idx]
diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py
index 30ae98f0..e7c43a78 100644
--- a/deepspeech/io/reader.py
+++ b/deepspeech/io/reader.py
@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
                 "Not supported: loader_type={}".format(filetype))

     def file_type(self, filepath):
-        suffix = filepath.split(":")[0].split('.')[1]
+        suffix = filepath.split(":")[0].split('.')[-1]
         if suffix == 'ark':
             return 'mat'
         elif suffix == 'scp':
diff --git a/docs/src/data_preparation.md b/docs/src/data_preparation.md
index a3d1b3eb..34d2a835 100644
--- a/docs/src/data_preparation.md
+++ b/docs/src/data_preparation.md
@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
 ```bash
 python3 utils/compute_mean_std.py \
 --num_samples 2000 \
---specgram_type linear \
+--spectrum_type linear \
 --manifest_path examples/librispeech/data/manifest.train \
 --output_path examples/librispeech/data/mean_std.npz
 ```
diff --git a/docs/src/deepspeech_architecture.md b/docs/src/deepspeech_architecture.md
index b9344122..5a6ca886 100644
--- a/docs/src/deepspeech_architecture.md
+++ b/docs/src/deepspeech_architecture.md
@@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute
 cd examples/aishell/s0
 python3 ../../../utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
---specgram_type="linear" \
+--spectrum_type="linear" \
 --delta_delta=false \
 --stride_ms=10.0 \
 --window_ms=20.0 \
diff --git a/examples/1xt2x/aishell/conf/deepspeech2.yaml b/examples/1xt2x/aishell/conf/deepspeech2.yaml
index 6e745e9d..c2d69226 100644
--- a/examples/1xt2x/aishell/conf/deepspeech2.yaml
+++ b/examples/1xt2x/aishell/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
index fbc7466f..be51a9b9 100644
--- a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+++ b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/1xt2x/librispeech/conf/deepspeech2.yaml
index edef0797..ad7fb2c1 100644
--- a/examples/1xt2x/librispeech/conf/deepspeech2.yaml
+++ b/examples/1xt2x/librispeech/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index 9560930a..ffefaeb3 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml
index 7e87594c..cac599dc 100644
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear #linear, mfcc, fbank
+  spectrum_type: linear #linear, mfcc, fbank
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index b106f3f2..1312a12f 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
     --window_ms=20.0 \
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index 6f8ae135..9b563da2 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index a4248459..dfa9a4b0 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
index 8d5ac4d5..c05c3ea2 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \
diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml
index f79b8eaa..a853658a 100644
--- a/examples/callcenter/s1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/s1/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml
index 3b08cc7a..bd4f4578 100644
--- a/examples/callcenter/s1/conf/conformer.yaml
+++ b/examples/callcenter/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh
index e2640ead..b2a495b4 100755
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index 3f1a376f..47ef9421 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml
index 180a6205..e2f91094 100644
--- a/examples/librispeech/s0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh
index b7180986..e3f7b325 100755
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml
index 92db20f6..872b560b 100644
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml
index e0bc3135..132a4f9d 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml
index 78be249c..769ed5f5 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index e4a06767..c9dc1413 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh
index 4ad476d3..2b6af229 100755
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml
index 92db20f6..872b560b 100644
--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml
index e0bc3135..132a4f9d 100644
--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml
index 9a727413..bc87466e 100644
--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh
index 4ad476d3..2b6af229 100755
--- a/examples/librispeech/s2/local/data.sh
+++ b/examples/librispeech/s2/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml
index 1aad86d2..8c03e328 100644
--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
index 0144c40d..cbfae93e 100644
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
index 32cfd9d7..43911c34 100755
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml
index c3b51996..1ae9acd0 100644
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
@@ -17,7 +17,7 @@ collator:
   augmentation_config: ""
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh
index 1d16f454..f4be9048 100755
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/s1/local/data.sh
@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index 40899655..a7940cb2 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml
index 0098a226..7e30409f 100644
--- a/examples/tiny/s0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/s0/conf/deepspeech2_online.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh
index 02fdb706..fabf2e40 100755
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
index be2e82f9..f3c7e1dd 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
index 93439a85..83005754 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
index 9bb67c44..628e3b77 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index fcbe1da4..27ffcae4 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
index 2aea250b..b5dbd581 100755
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py
index a468153d..0f63715a 100755
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -27,7 +27,7 @@
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples', int, 2000, "# of samples to for statistics.")
-add_arg('specgram_type', str,
+add_arg('spectrum_type', str,
         'linear',
         "Audio feature type. Options: linear, mfcc, fbank.",
         choices=['linear', 'mfcc', 'fbank'])
@@ -58,7 +58,7 @@ def main():
     augmentation_pipeline = AugmentationPipeline('{}')

     audio_featurizer = AudioFeaturizer(
-        specgram_type=args.specgram_type,
+        spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
         stride_ms=args.stride_ms,