test refactor collator

pull/865/head
Hui Zhang 3 years ago
parent f628e21816
commit b7b1bda34f

@@ -31,7 +31,6 @@ from yacs.config import CfgNode
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.collator import TripletSpeechCollator
 from deepspeech.io.dataset import ManifestDataset
-from deepspeech.io.dataset import TripletManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
         config.collator.keep_transcription_text = False
         # train/valid dataset, return token ids
-        Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
         config.data.manifest = config.data.train_manifest
-        train_dataset = Dataset.from_config(config)
+        train_dataset = ManifestDataset.from_config(config)
         config.data.manifest = config.data.dev_manifest
-        dev_dataset = Dataset.from_config(config)
+        dev_dataset = ManifestDataset.from_config(config)
         if config.model.model_conf.asr_weight > 0.:
             Collator = TripletSpeechCollator
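With `TripletManifestDataset` gone, the trainer always builds `ManifestDataset` and only the collator choice depends on `asr_weight`. A minimal sketch of the resulting setup logic (`build_datasets` is a hypothetical helper added here for illustration; the config access paths follow the hunk above):

```python
from deepspeech.io.collator import SpeechCollator, TripletSpeechCollator
from deepspeech.io.dataset import ManifestDataset

def build_datasets(config):
    # Hypothetical helper: both splits now use the same dataset class.
    config.data.manifest = config.data.train_manifest
    train_dataset = ManifestDataset.from_config(config)
    config.data.manifest = config.data.dev_manifest
    dev_dataset = ManifestDataset.from_config(config)
    # Only the collator varies with joint ASR training.
    Collator = (TripletSpeechCollator
                if config.model.model_conf.asr_weight > 0. else SpeechCollator)
    return train_dataset, dev_dataset, Collator
```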

@@ -24,15 +24,15 @@ class AudioFeaturizer():
     Currently, it supports feature types of linear spectrogram and mfcc.

-    :param specgram_type: Specgram feature type. Options: 'linear'.
-    :type specgram_type: str
+    :param spectrum_type: Specgram feature type. Options: 'linear'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
                      corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_feq is the
+                     returned; when spectrum_type is 'mfcc', max_freq is the
                      highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Audio are resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer():
     """

     def __init__(self,
-                 specgram_type: str='linear',
+                 spectrum_type: str='linear',
                  feat_dim: int=None,
                  delta_delta: bool=False,
                  stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer():
                  use_dB_normalization=True,
                  target_dB=-20,
                  dither=1.0):
-        self._specgram_type = specgram_type
+        self._spectrum_type = spectrum_type
         # mfcc and fbank using `feat_dim`
         self._feat_dim = feat_dim
         # mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer():
     def feature_size(self):
         """audio feature size"""
         feat_dim = 0
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             fft_point = self._window_ms if self._fft_point is None else self._fft_point
             feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
                            1)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             # mfcc, delta, delta-delta
             feat_dim = int(self._feat_dim *
                            3) if self._delta_delta else int(self._feat_dim)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             # fbank, delta, delta-delta
             feat_dim = int(self._feat_dim *
                            3) if self._delta_delta else int(self._feat_dim)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear." % self._spectrum_type)
         return feat_dim

     def _compute_specgram(self, audio_segment):
         """Extract various audio features."""
         sample_rate = audio_segment.sample_rate
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             samples = audio_segment.samples
             return self._compute_linear_specgram(
                 samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer():
                 stride_ms=self._stride_ms,
                 window_ms=self._window_ms,
                 max_freq=self._max_freq)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             samples = audio_segment.to('int16')
             return self._compute_mfcc(
                 samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer():
                 max_freq=self._max_freq,
                 dither=self._dither,
                 delta_delta=self._delta_delta)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             samples = audio_segment.to('int16')
             return self._compute_fbank(
                 samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer():
                 dither=self._dither,
                 delta_delta=self._delta_delta)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear." % self._spectrum_type)

     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""

@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 class SpeechFeaturizer():
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type specgram_type: str
-    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type specgram_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
-                     corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_freq is the
-                     highest band edge of mel filters.
-    :types max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
+    """Speech and Text feature extraction.
     """

     def __init__(self,
                  unit_type,
                  vocab_filepath,
                  spm_model_prefix=None,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  stride_ms=10.0,
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
         self.window_ms = window_ms
         self.audio_feature = AudioFeaturizer(
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
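Call sites construct the featurizer with the renamed keyword. A minimal construction sketch, assuming the module path `deepspeech.frontend.featurizer.speech_featurizer` and placeholder file paths:

```python
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer

featurizer = SpeechFeaturizer(
    unit_type='char',
    vocab_filepath='data/vocab.txt',  # placeholder path
    spectrum_type='linear',           # formerly specgram_type
    stride_ms=10.0,
    window_ms=20.0)
```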

@@ -15,6 +15,7 @@
 import json
 import math
 import tarfile
+from collections import namedtuple
 from typing import List
 from typing import Optional
 from typing import Text

@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]

 logger = Log(__name__).getlog()


+def tokenids(text, keep_transcription_text):
+    # for training text is token ids
+    tokens = text  # token ids
+    if keep_transcription_text:
+        # text is string, convert to unicode ord
+        assert isinstance(text, str), (type(text), text)
+        tokens = [ord(t) for t in text]
+    tokens = np.array(tokens, dtype=np.int64)
+    return tokens
+
+
 class SpeechCollatorBase():
     def __init__(
             self,
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
         # extract speech feature
         spectrum, transcript_part = self._speech_featurizer.featurize(
             speech_segment, self.keep_transcription_text)
-
         # CMVN spectrum
         if self._normalizer:
             spectrum = self._normalizer.apply(spectrum)
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)
         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : (B, Umax)
+                olens: (B,)
         """
         audios = []
         audio_lens = []
         texts = []
         text_lens = []
         utts = []
-        for utt, audio, text in batch:
+
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+            audio = item['feat']
+            text = item['text']
             audio, text = self.process_utterance(audio, text)
-            #utt
-            utts.append(utt)
-            # audio
             audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids, else text is string, convert to unicode ord
-            tokens = []
-            if self.keep_transcription_text:
-                assert isinstance(text, str), (type(text), text)
-                tokens = [ord(t) for t in text]
-            else:
-                tokens = text  # token ids
-            tokens = np.array(tokens, dtype=np.int64)
+            tokens = tokenids(text, self.keep_transcription_text)
             texts.append(tokens)
             text_lens.append(tokens.shape[0])
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
         """batch examples

         Args:
-            batch ([List]): batch is (audio, text)
+            batch (List[Dict]): batch is [dict(audio, text, ...)]
                 audio (np.ndarray) shape (T, D)
                 text (List[int] or str): shape (U,)
         Returns:
-            tuple(audio, text, audio_lens, text_lens): batched data.
-                audio : (B, Tmax, D)
-                audio_lens: (B)
-                text : (B, Umax)
-                text_lens: (B)
+            tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
+                utts: (B,)
+                xs_pad : (B, Tmax, D)
+                ilens: (B,)
+                ys_pad : [(B, Umax), (B, Umax)]
+                olens: [(B,), (B,)]
         """
+        utts = []
         audios = []
         audio_lens = []
         translation_text = []
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
         transcription_text = []
         transcription_text_lens = []

-        utts = []
-        for utt, audio, translation, transcription in batch:
+        for idx, item in enumerate(batch):
+            utts.append(item['utt'])
+            audio = item['feat']
+            translation = item['text']
+            transcription = item['text1']
             audio, translation, transcription = self.process_utterance(
                 audio, translation, transcription)
-            #utt
-            utts.append(utt)
-            # audio
             audios.append(audio)  # [T, D]
             audio_lens.append(audio.shape[0])
-            # text
-            # for training, text is token ids
-            # else text is string, convert to unicode ord
             tokens = [[], []]
             for idx, text in enumerate([translation, transcription]):
-                if self.keep_transcription_text:
-                    assert isinstance(text, str), (type(text), text)
-                    tokens[idx] = [ord(t) for t in text]
-                else:
-                    tokens[idx] = text  # token ids
-                tokens[idx] = np.array(tokens[idx], dtype=np.int64)
+                tokens[idx] = tokenids(text, self.keep_transcription_text)
             translation_text.append(tokens[0])
             translation_text_lens.append(tokens[0].shape[0])
             transcription_text.append(tokens[1])
             transcription_text_lens.append(tokens[1].shape[0])

-        padded_audios = pad_sequence(
-            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
-        audio_lens = np.array(audio_lens).astype(np.int64)
-        padded_translation = pad_sequence(
-            translation_text, padding_value=IGNORE_ID).astype(np.int64)
+        xs_pad = pad_list(audios, 0.0).astype(np.float32)  #[B, T, D]
+        ilens = np.array(audio_lens).astype(np.int64)
+
+        padded_translation = pad_list(translation_text,
+                                      IGNORE_ID).astype(np.int64)
         translation_lens = np.array(translation_text_lens).astype(np.int64)
-        padded_transcription = pad_sequence(
-            transcription_text, padding_value=IGNORE_ID).astype(np.int64)
+
+        padded_transcription = pad_list(transcription_text,
+                                        IGNORE_ID).astype(np.int64)
         transcription_lens = np.array(transcription_text_lens).astype(np.int64)
-        return utts, padded_audios, audio_lens, (
-            padded_translation, padded_transcription), (translation_lens,
-                                                        transcription_lens)
+
+        ys_pad = (padded_translation, padded_transcription)
+        olens = (translation_lens, transcription_lens)
+        return utts, xs_pad, ilens, ys_pad, olens
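The new `tokenids` helper centralizes the text handling that both `__call__` bodies previously duplicated, and batch items are now manifest dicts instead of positional tuples. A small self-contained sketch of both behaviors (the example values are made up):

```python
import numpy as np

def tokenids(text, keep_transcription_text):
    # training path: text is already a list of token ids
    tokens = text
    if keep_transcription_text:
        # decoding path: text is a string, kept as unicode code points
        assert isinstance(text, str), (type(text), text)
        tokens = [ord(t) for t in text]
    return np.array(tokens, dtype=np.int64)

print(tokenids([4, 9, 2], keep_transcription_text=False))  # [4 9 2]
print(tokenids("hi", keep_transcription_text=True))        # [104 105]

# a triplet batch item as the collator now expects it:
item = {'utt': 'utt-0001', 'feat': 'feats/utt-0001.wav',
        'text': [4, 9, 2], 'text1': [7, 1]}
```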

@@ -19,7 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+__all__ = ["ManifestDataset", "TransformDataset"]

 logger = Log(__name__).getlog()
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
         return len(self._manifest)

     def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance["text"]
-
-
-class TripletManifestDataset(ManifestDataset):
-    """
-    For Joint Training of Speech Translation and ASR.
-    text: translation,
-    text1: transcript.
-    """
-
-    def __getitem__(self, idx):
-        instance = self._manifest[idx]
-        return instance["utt"], instance["feat"], instance[
-            "text"], instance["text1"]
+        return self._manifest[idx]


 class TransformDataset(Dataset):
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
         return len(self.minibatch)

     def __getitem__(self, idx):
-        instance = self.minibatch[idx]
-        return instance["utt"], instance["feat"], instance["text"]
+        return self.minibatch[idx]

@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
                 "Not supported: loader_type={}".format(filetype))

     def file_type(self, filepath):
-        suffix = filepath.split(":")[0].split('.')[1]
+        suffix = filepath.split(":")[0].split('.')[-1]
         if suffix == 'ark':
             return 'mat'
         elif suffix == 'scp':
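The `[-1]` index matters when a basename contains more than one dot, as split kaldi-style ark/scp names often do. A quick check of both behaviors (the path is made up):

```python
filepath = "dump/feats.1.ark:123"  # made-up read specifier

base = filepath.split(":")[0]  # "dump/feats.1.ark"
print(base.split('.')[1])      # "1"   -- the old code saw this, not a suffix
print(base.split('.')[-1])     # "ark" -- the fixed code gets the real suffix
```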

@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
 ```bash
 python3 utils/compute_mean_std.py \
 --num_samples 2000 \
---specgram_type linear \
+--spectrum_type linear \
 --manifest_path examples/librispeech/data/manifest.train \
 --output_path examples/librispeech/data/mean_std.npz
 ```

@@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute
 cd examples/aishell/s0
 python3 ../../../utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
---specgram_type="linear" \
+--spectrum_type="linear" \
 --delta_delta=false \
 --stride_ms=10.0 \
 --window_ms=20.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
  feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear #linear, mfcc, fbank
+  spectrum_type: linear #linear, mfcc, fbank
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
     --window_ms=20.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None

@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 32
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 16
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   # augmentation_config: conf/augmentation.json
   batch_size: 10
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -17,7 +17,7 @@ collator:
   augmentation_config: ""
   batch_size: 64
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix:
-  specgram_type: linear
+  spectrum_type: linear
   feat_dim:
   delta_delta: False
   stride_ms: 10.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -18,7 +18,7 @@ collator:
   augmentation_config: conf/augmentation.json
   batch_size: 4
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0

@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \

@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('num_samples', int, 2000, "# of samples to for statistics.")
-add_arg('specgram_type', str,
+add_arg('spectrum_type', str,
         'linear',
         "Audio feature type. Options: linear, mfcc, fbank.",
         choices=['linear', 'mfcc', 'fbank'])
@@ -58,7 +58,7 @@ def main():
     augmentation_pipeline = AugmentationPipeline('{}')
     audio_featurizer = AudioFeaturizer(
-        specgram_type=args.specgram_type,
+        spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
         stride_ms=args.stride_ms,
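Any caller of the statistics tool must switch to the renamed flag; the README snippets above already reflect this. A minimal standalone sketch of the argparse behavior after the rename (a simplified stand-in for the `add_arg` helper):

```python
import argparse

parser = argparse.ArgumentParser()
# after this commit the tool exposes --spectrum_type, not --specgram_type
parser.add_argument('--spectrum_type', type=str, default='linear',
                    choices=['linear', 'mfcc', 'fbank'])
args = parser.parse_args(['--spectrum_type', 'fbank'])
assert args.spectrum_type == 'fbank'
```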
