diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index f5a514c7..9a34cbdc 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -31,7 +31,6 @@ from yacs.config import CfgNode
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.collator import TripletSpeechCollator
 from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.dataset import TripletManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
         config.collator.keep_transcription_text = False

         # train/valid dataset, return token ids
-        Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
         config.data.manifest = config.data.train_manifest
-        train_dataset = Dataset.from_config(config)
+        train_dataset = ManifestDataset.from_config(config)

         config.data.manifest = config.data.dev_manifest
-        dev_dataset = Dataset.from_config(config)
+        dev_dataset = ManifestDataset.from_config(config)

         if config.model.model_conf.asr_weight > 0.:
             Collator = TripletSpeechCollator
diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py
index 4c40c847..6f3b646c 100644
--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -24,15 +24,15 @@ class AudioFeaturizer():
     Currently, it supports feature types of linear spectrogram and mfcc.

-    :param specgram_type: Specgram feature type. Options: 'linear'.
-    :type specgram_type: str
+    :param spectrum_type: Specgram feature type. Options: 'linear'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
                      corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_feq is the
+                     returned; when spectrum_type is 'mfcc', max_freq is the
                      highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Audio are resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer():
     """

     def __init__(self,
-                 specgram_type: str='linear',
+                 spectrum_type: str='linear',
                  feat_dim: int=None,
                  delta_delta: bool=False,
                  stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer():
                  use_dB_normalization=True,
                  target_dB=-20,
                  dither=1.0):
-        self._specgram_type = specgram_type
+        self._spectrum_type = spectrum_type
         # mfcc and fbank using `feat_dim`
         self._feat_dim = feat_dim
         # mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer():
     def feature_size(self):
         """audio feature size"""
         feat_dim = 0
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             fft_point = self._window_ms if self._fft_point is None else self._fft_point
             feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             # mfcc, delta, delta-delta
             feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             # fbank, delta, delta-delta
             feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear, mfcc, fbank." % self._spectrum_type)
         return feat_dim

     def _compute_specgram(self, audio_segment):
         """Extract various audio features."""
         sample_rate = audio_segment.sample_rate
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             samples = audio_segment.samples
             return self._compute_linear_specgram(
                 samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer():
                 stride_ms=self._stride_ms,
                 window_ms=self._window_ms,
                 max_freq=self._max_freq)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             samples = audio_segment.to('int16')
             return self._compute_mfcc(
                 samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer():
                 max_freq=self._max_freq,
                 dither=self._dither,
                 delta_delta=self._delta_delta)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             samples = audio_segment.to('int16')
             return self._compute_fbank(
                 samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer():
                 dither=self._dither,
                 delta_delta=self._delta_delta)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear, mfcc, fbank." % self._spectrum_type)

     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""
diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py
index f9f7d7c2..7471d164 100644
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer


 class SpeechFeaturizer():
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type specgram_type: str
-    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type specgram_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
-                     corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_freq is the
-                     highest band edge of mel filters.
-    :types max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
+    """Speech and Text feature extraction.
     """

     def __init__(self,
                  unit_type,
                  vocab_filepath,
                  spm_model_prefix=None,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  stride_ms=10.0,
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
         self.window_ms = window_ms

         self.audio_feature = AudioFeaturizer(
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py
index 2a581232..f5fc3097 100644
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -15,6 +15,7 @@
 import json
 import math
 import tarfile
+from collections import namedtuple
 from typing import List
 from typing import Optional
 from typing import Text
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index c5c0a414..553ffcb5 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
 logger = Log(__name__).getlog()


+def tokenids(text, keep_transcription_text):
+    # for training text is token ids
+    tokens = text # token ids
+
+    if keep_transcription_text:
+        # text is string, convert to unicode ord
+        assert isinstance(text, str), (type(text), text)
+        tokens = [ord(t) for t in text]
+
+    tokens = np.array(tokens, dtype=np.int64)
+    return tokens
+
+
 class SpeechCollatorBase():
     def __init__(
             self,
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
         # extract speech feature
         spectrum, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self.keep_transcription_text)
-
         # CMVN spectrum
         if self._normalizer:
             spectrum = self._normalizer.apply(spectrum)
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : (B, Umax)
+                olens: (B,)
         """
         audios = []
         audio_lens = []
         texts = []
         text_lens = []
         utts = []
-        for utt, audio, text in batch:
+
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+
+            audio = item['feat']
+            text = item['text']
             audio, text = self.process_utterance(audio, text)
-            #utt
-            utts.append(utt)
-            # audio
+
             audios.append(audio) # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids, else text is string, convert to unicode ord
-            tokens = []
-            if self.keep_transcription_text:
-                assert isinstance(text, str), (type(text), text)
-                tokens = [ord(t) for t in text]
-            else:
-                tokens = text # token ids
-            tokens = np.array(tokens, dtype=np.int64)
+
+            tokens = tokenids(text, self.keep_transcription_text)
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)

         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : [(B, Umax), (B, Umax)]
+                olens: [(B,), (B,)]
         """
+        utts = []
         audios = []
         audio_lens = []
         translation_text = []
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
         translation_text_lens = []
         transcription_text = []
         transcription_text_lens = []
-        utts = []
-        for utt, audio, translation, transcription in batch:
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+
+            audio = item['feat']
+            translation = item['text']
+            transcription = item['text1']
             audio, translation, transcription = self.process_utterance(
                 audio, translation, transcription)
-            #utt
-            utts.append(utt)
-            # audio
+
             audios.append(audio) # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids
-            # else text is string, convert to unicode ord
+
             tokens = [[], []]
             for idx, text in enumerate([translation, transcription]):
-                if self.keep_transcription_text:
-                    assert isinstance(text, str), (type(text), text)
-                    tokens[idx] = [ord(t) for t in text]
-                else:
-                    tokens[idx] = text # token ids
-                tokens[idx] = np.array(tokens[idx], dtype=np.int64)
+                tokens[idx] = tokenids(text, self.keep_transcription_text)
             translation_text.append(tokens[0])
             translation_text_lens.append(tokens[0].shape[0])
             transcription_text.append(tokens[1])
             transcription_text_lens.append(tokens[1].shape[0])
-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32) #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_translation = pad_sequence(
-            translation_text, padding_value=IGNORE_ID).astype(np.int64)
+        xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D]
+        ilens = np.array(audio_lens).astype(np.int64)
+
+        padded_translation = pad_list(translation_text,
+                                      IGNORE_ID).astype(np.int64)
         translation_lens = np.array(translation_text_lens).astype(np.int64)
-        padded_transcription = pad_sequence(
-            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
+
+        padded_transcription = pad_list(transcription_text,
+                                        IGNORE_ID).astype(np.int64)
         transcription_lens = np.array(transcription_text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, (
-            padded_translation, padded_transcription), (translation_lens,
-                                                        transcription_lens)
+
+        ys_pad = (padded_translation, padded_transcription)
+        olens = (translation_lens, transcription_lens)
+        return utts, xs_pad, ilens, ys_pad, olens
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 56e53475..1945c5f7 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+__all__ = ["ManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
         return len(self._manifest)

     def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"]
-
-
-class TripletManifestDataset(ManifestDataset):
-    """
-    For Joint Training of Speech Translation and ASR.
-    text: translation,
-    text1: transcript.
-    """
-
-    def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"], instance[
-            "text1"]
+        return self._manifest[idx]


 class TransformDataset(Dataset):
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
         return len(self.minibatch)

     def __getitem__(self, idx):
-        instance = self.minibatch[idx]
-        return instance["utt"], instance["feat"], instance["text"]
+        return self.minibatch[idx]
diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py
index 30ae98f0..e7c43a78 100644
--- a/deepspeech/io/reader.py
+++ b/deepspeech/io/reader.py
@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
                 "Not supported: loader_type={}".format(filetype))

     def file_type(self, filepath):
-        suffix = filepath.split(":")[0].split('.')[1]
+        suffix = filepath.split(":")[0].split('.')[-1]
         if suffix == 'ark':
             return 'mat'
         elif suffix == 'scp':
diff --git a/docs/src/data_preparation.md b/docs/src/data_preparation.md
index a3d1b3eb..34d2a835 100644
--- a/docs/src/data_preparation.md
+++ b/docs/src/data_preparation.md
@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
 ```bash
 python3 utils/compute_mean_std.py \
 --num_samples 2000 \
---specgram_type linear \
+--spectrum_type linear \
 --manifest_path examples/librispeech/data/manifest.train \
 --output_path examples/librispeech/data/mean_std.npz
 ```
diff --git a/docs/src/deepspeech_architecture.md b/docs/src/deepspeech_architecture.md
index b9344122..5a6ca886 100644
--- a/docs/src/deepspeech_architecture.md
+++ b/docs/src/deepspeech_architecture.md
@@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute
 cd examples/aishell/s0
 python3 ../../../utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
---specgram_type="linear" \
+--spectrum_type="linear" \
 --delta_delta=false \
 --stride_ms=10.0 \
 --window_ms=20.0 \
diff --git a/examples/1xt2x/aishell/conf/deepspeech2.yaml b/examples/1xt2x/aishell/conf/deepspeech2.yaml
index 6e745e9d..c2d69226 100644
--- a/examples/1xt2x/aishell/conf/deepspeech2.yaml
+++ b/examples/1xt2x/aishell/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
index fbc7466f..be51a9b9 100644
--- a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
+++ b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/1xt2x/librispeech/conf/deepspeech2.yaml
index edef0797..ad7fb2c1 100644
--- a/examples/1xt2x/librispeech/conf/deepspeech2.yaml
+++ b/examples/1xt2x/librispeech/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index 9560930a..ffefaeb3 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml
index 7e87594c..cac599dc 100644
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear #linear, mfcc, fbank
+  spectrum_type: linear #linear, mfcc, fbank
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index b106f3f2..1312a12f 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
     --window_ms=20.0 \
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index 6f8ae135..9b563da2 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index a4248459..dfa9a4b0 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
index 8d5ac4d5..c05c3ea2 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \
diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml
index f79b8eaa..a853658a 100644
--- a/examples/callcenter/s1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/s1/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml
index 3b08cc7a..bd4f4578 100644
--- a/examples/callcenter/s1/conf/conformer.yaml
+++ b/examples/callcenter/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh
index e2640ead..b2a495b4 100755
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index 3f1a376f..47ef9421 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml
index 180a6205..e2f91094 100644
--- a/examples/librispeech/s0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh
index b7180986..e3f7b325 100755
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml
index 92db20f6..872b560b 100644
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml
index e0bc3135..132a4f9d 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml
index 78be249c..769ed5f5 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index e4a06767..c9dc1413 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh
index 4ad476d3..2b6af229 100755
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml
index 92db20f6..872b560b 100644
--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml
index e0bc3135..132a4f9d 100644
--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml
index 9a727413..bc87466e 100644
--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh
index 4ad476d3..2b6af229 100755
--- a/examples/librispeech/s2/local/data.sh
+++ b/examples/librispeech/s2/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml
index 1aad86d2..8c03e328 100644
--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
index 0144c40d..cbfae93e 100644
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
index 32cfd9d7..43911c34 100755
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml
index c3b51996..1ae9acd0 100644
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
@@ -17,7 +17,7 @@ collator:
   augmentation_config: ""
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh
index 1d16f454..f4be9048 100755
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/s1/local/data.sh
@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index 40899655..a7940cb2 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml
index 0098a226..7e30409f 100644
--- a/examples/tiny/s0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/s0/conf/deepspeech2_online.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0
diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh
index 02fdb706..fabf2e40 100755
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
index be2e82f9..f3c7e1dd 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
index 93439a85..83005754 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
index 9bb67c44..628e3b77 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index fcbe1da4..27ffcae4 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
index 2aea250b..b5dbd581 100755
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py
index a468153d..0f63715a 100755
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -27,7 +27,7 @@
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples', int, 2000, "# of samples to for statistics.")
-add_arg('specgram_type', str,
+add_arg('spectrum_type', str,
         'linear',
         "Audio feature type. Options: linear, mfcc, fbank.",
         choices=['linear', 'mfcc', 'fbank'])
@@ -58,7 +58,7 @@ def main():
     augmentation_pipeline = AugmentationPipeline('{}')

     audio_featurizer = AudioFeaturizer(
-        specgram_type=args.specgram_type,
+        spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
         stride_ms=args.stride_ms,