test refactor collator

pull/865/head
Hui Zhang 3 years ago
parent f628e21816
commit b7b1bda34f

@@ -31,7 +31,6 @@ from yacs.config import CfgNode
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.collator import TripletSpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.dataset import TripletManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2_st import U2STModel
@@ -249,12 +248,11 @@ class U2STTrainer(Trainer):
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset
config.data.manifest = config.data.train_manifest
train_dataset = Dataset.from_config(config)
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = Dataset.from_config(config)
dev_dataset = ManifestDataset.from_config(config)
if config.model.model_conf.asr_weight > 0.:
Collator = TripletSpeechCollator

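The trainer now builds both train and dev datasets from `ManifestDataset` unconditionally; only the collator still branches on `asr_weight`. A minimal sketch of the resulting selection logic, assuming both collator classes expose the same `from_config` constructor the surrounding trainer code implies:

```python
# Sketch of the post-refactor selection (from_config on the collators is an
# assumption based on how this trainer uses them).
from deepspeech.io.collator import SpeechCollator, TripletSpeechCollator
from deepspeech.io.dataset import ManifestDataset

def build_data(config):
    config.data.manifest = config.data.train_manifest
    train_dataset = ManifestDataset.from_config(config)
    config.data.manifest = config.data.dev_manifest
    dev_dataset = ManifestDataset.from_config(config)
    # only the collator still depends on whether the ASR loss is enabled
    Collator = (TripletSpeechCollator
                if config.model.model_conf.asr_weight > 0. else SpeechCollator)
    return train_dataset, dev_dataset, Collator.from_config(config)
```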
@@ -24,15 +24,15 @@ class AudioFeaturizer():
Currently, it supports feature types of linear spectrogram, mfcc and fbank.
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param spectrum_type: Spectrum feature type. Options: 'linear', 'mfcc', 'fbank'.
:type spectrum_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins
:param max_freq: When spectrum_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_freq is the
returned; when spectrum_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:type max_freq: None|float
:param target_sample_rate: Audio is resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer():
"""
def __init__(self,
specgram_type: str='linear',
spectrum_type: str='linear',
feat_dim: int=None,
delta_delta: bool=False,
stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer():
use_dB_normalization=True,
target_dB=-20,
dither=1.0):
self._specgram_type = specgram_type
self._spectrum_type = spectrum_type
# mfcc and fbank using `feat_dim`
self._feat_dim = feat_dim
# mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer():
def feature_size(self):
"""audio feature size"""
feat_dim = 0
if self._specgram_type == 'linear':
if self._spectrum_type == 'linear':
fft_point = self._window_ms if self._fft_point is None else self._fft_point
feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
1)
elif self._specgram_type == 'mfcc':
elif self._spectrum_type == 'mfcc':
# mfcc, delta, delta-delta
feat_dim = int(self._feat_dim *
3) if self._delta_delta else int(self._feat_dim)
elif self._specgram_type == 'fbank':
elif self._spectrum_type == 'fbank':
# fbank, delta, delta-delta
feat_dim = int(self._feat_dim *
3) if self._delta_delta else int(self._feat_dim)
else:
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
raise ValueError("Unknown spectrum_type %s. "
"Supported values: linear, mfcc, fbank." % self._spectrum_type)
return feat_dim
def _compute_specgram(self, audio_segment):
"""Extract various audio features."""
sample_rate = audio_segment.sample_rate
if self._specgram_type == 'linear':
if self._spectrum_type == 'linear':
samples = audio_segment.samples
return self._compute_linear_specgram(
samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer():
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq)
elif self._specgram_type == 'mfcc':
elif self._spectrum_type == 'mfcc':
samples = audio_segment.to('int16')
return self._compute_mfcc(
samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer():
max_freq=self._max_freq,
dither=self._dither,
delta_delta=self._delta_delta)
elif self._specgram_type == 'fbank':
elif self._spectrum_type == 'fbank':
samples = audio_segment.to('int16')
return self._compute_fbank(
samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer():
dither=self._dither,
delta_delta=self._delta_delta)
else:
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
raise ValueError("Unknown spectrum_type %s. "
"Supported values: linear, mfcc, fbank." % self._spectrum_type)
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""

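For the linear branch above, the feature size follows directly from the FFT length. A small worked check (a standalone sketch, not the class method; the 20 ms window and 16 kHz rate match the example configs further down):

```python
# 20 ms window at 16 kHz -> 320-sample FFT -> 320/2 + 1 = 161 frequency bins.
def linear_feature_size(window_ms=20.0, target_sample_rate=16000, fft_point=None):
    fft_point = window_ms if fft_point is None else fft_point
    return int(fft_point * (target_sample_rate / 1000) / 2 + 1)

assert linear_feature_size() == 161
```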
@@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer():
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
Currently, for audio parts, it supports feature types of linear
spectrogram and mfcc; for transcript parts, it only supports char-level
tokenizing and conversion into a list of token indices. Note that the
token indexing order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type specgram_type: str
:param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:types max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""Speech and Text feature extraction.
"""
def __init__(self,
unit_type,
vocab_filepath,
spm_model_prefix=None,
specgram_type='linear',
spectrum_type='linear',
feat_dim=None,
delta_delta=False,
stride_ms=10.0,
@@ -70,7 +40,7 @@ class SpeechFeaturizer():
self.window_ms = window_ms
self.audio_feature = AudioFeaturizer(
specgram_type=specgram_type,
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=stride_ms,

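Callers of the featurizer only need to swap the keyword. A hedged usage sketch (the import path mirrors the text_featurizer import above, and the vocab path is a placeholder):

```python
# assumed module path, by analogy with the text_featurizer import above
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer

featurizer = SpeechFeaturizer(
    unit_type='char',
    vocab_filepath='data/vocab.txt',  # hypothetical path
    spectrum_type='fbank',            # renamed from specgram_type
    feat_dim=80,
    delta_delta=False,
    stride_ms=10.0)
```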
@@ -15,6 +15,7 @@
import json
import math
import tarfile
from collections import namedtuple
from typing import List
from typing import Optional
from typing import Text

@@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"]
logger = Log(__name__).getlog()
def tokenids(text, keep_transcription_text):
# for training, text is already token ids
tokens = text # token ids
if keep_transcription_text:
# text is string, convert to unicode ord
assert isinstance(text, str), (type(text), text)
tokens = [ord(t) for t in text]
tokens = np.array(tokens, dtype=np.int64)
return tokens
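Illustrative behavior of the new helper (values are examples only; `np` is the numpy import already present in this module):

```python
tokenids([5, 9, 2], keep_transcription_text=False)  # -> array([5, 9, 2])
tokenids("hi", keep_transcription_text=True)        # -> array([104, 105]), unicode ords
```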
class SpeechCollatorBase():
def __init__(
self,
@@ -150,7 +163,6 @@ class SpeechCollatorBase():
# extract speech feature
spectrum, transcript_part = self._speech_featurizer.featurize(
speech_segment, self.keep_transcription_text)
# CMVN spectrum
if self._normalizer:
spectrum = self._normalizer.apply(spectrum)
@@ -163,38 +175,35 @@ class SpeechCollatorBase():
"""batch examples
Args:
batch ([List]): batch is (audio, text)
batch (List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
audio : (B, Tmax, D)
audio_lens: (B)
text : (B, Umax)
text_lens: (B)
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
utts: (B,)
xs_pad : (B, Tmax, D)
ilens: (B,)
ys_pad : (B, Umax)
olens: (B,)
"""
audios = []
audio_lens = []
texts = []
text_lens = []
utts = []
for utt, audio, text in batch:
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
text = item['text']
audio, text = self.process_utterance(audio, text)
#utt
utts.append(utt)
# audio
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
# text
# for training, text is token ids, else text is string, convert to unicode ord
tokens = []
if self.keep_transcription_text:
assert isinstance(text, str), (type(text), text)
tokens = [ord(t) for t in text]
else:
tokens = text # token ids
tokens = np.array(tokens, dtype=np.int64)
tokens = tokenids(text, self.keep_transcription_text)
texts.append(tokens)
text_lens.append(tokens.shape[0])
@@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator):
"""batch examples
Args:
batch ([List]): batch is (audio, text)
batch (List[Dict]): batch is [dict(audio, text, ...)]
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
tuple(audio, text, audio_lens, text_lens): batched data.
audio : (B, Tmax, D)
audio_lens: (B)
text : (B, Umax)
text_lens: (B)
tuple(utts, xs_pad, ilens, ys_pad, olens): batched data.
utts: (B,)
xs_pad : (B, Tmax, D)
ilens: (B,)
ys_pad : [(B, Umax), (B, Umax)]
olens: [(B,), (B,)]
"""
utts = []
audios = []
audio_lens = []
translation_text = []
@@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator):
transcription_text = []
transcription_text_lens = []
utts = []
for utt, audio, translation, transcription in batch:
for idx, item in enumerate(batch):
utts.append(item['utt'])
audio = item['feat']
translation = item['text']
transcription = item['text1']
audio, translation, transcription = self.process_utterance(
audio, translation, transcription)
#utt
utts.append(utt)
# audio
audios.append(audio) # [T, D]
audio_lens.append(audio.shape[0])
# text
# for training, text is token ids
# else text is string, convert to unicode ord
tokens = [[], []]
for idx, text in enumerate([translation, transcription]):
if self.keep_transcription_text:
assert isinstance(text, str), (type(text), text)
tokens[idx] = [ord(t) for t in text]
else:
tokens[idx] = text # token ids
tokens[idx] = np.array(tokens[idx], dtype=np.int64)
tokens[idx] = tokenids(text, self.keep_transcription_text)
translation_text.append(tokens[0])
translation_text_lens.append(tokens[0].shape[0])
transcription_text.append(tokens[1])
transcription_text_lens.append(tokens[1].shape[0])
padded_audios = pad_sequence(
audios, padding_value=0.0).astype(np.float32) #[B, T, D]
audio_lens = np.array(audio_lens).astype(np.int64)
padded_translation = pad_sequence(
translation_text, padding_value=IGNORE_ID).astype(np.int64)
xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D]
ilens = np.array(audio_lens).astype(np.int64)
padded_translation = pad_list(translation_text,
IGNORE_ID).astype(np.int64)
translation_lens = np.array(translation_text_lens).astype(np.int64)
padded_transcription = pad_sequence(
transcription_text, padding_value=IGNORE_ID).astype(np.int64)
padded_transcription = pad_list(transcription_text,
IGNORE_ID).astype(np.int64)
transcription_lens = np.array(transcription_text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, (
padded_translation, padded_transcription), (translation_lens,
transcription_lens)
ys_pad = (padded_translation, padded_transcription)
olens = (translation_lens, transcription_lens)
return utts, xs_pad, ilens, ys_pad, olens

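`pad_list` replaces `pad_sequence` here; both right-pad variable-length arrays into a batch tensor. A minimal sketch of the assumed behavior (not the library implementation), with `IGNORE_ID` assumed to be a negative sentinel:

```python
import numpy as np

def pad_list_sketch(xs, pad_value):
    # right-pad each [T_i, ...] array to T_max, then stack to [B, T_max, ...]
    max_len = max(x.shape[0] for x in xs)
    out = np.full((len(xs), max_len) + xs[0].shape[1:], pad_value, dtype=xs[0].dtype)
    for i, x in enumerate(xs):
        out[i, :x.shape[0]] = x
    return out

# e.g. two token sequences padded with an assumed IGNORE_ID of -1:
ys = [np.array([3, 4, 5]), np.array([7])]
pad_list_sketch(ys, -1)  # -> [[3, 4, 5], [7, -1, -1]]
```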
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log
__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
__all__ = ["ManifestDataset", "TransformDataset"]
logger = Log(__name__).getlog()
@@ -107,21 +107,7 @@ class ManifestDataset(Dataset):
return len(self._manifest)
def __getitem__(self, idx):
instance = self._manifest[idx]
return instance["utt"], instance["feat"], instance["text"]
class TripletManifestDataset(ManifestDataset):
"""
For Joint Training of Speech Translation and ASR.
text: translation,
text1: transcript.
"""
def __getitem__(self, idx):
instance = self._manifest[idx]
return instance["utt"], instance["feat"], instance["text"], instance[
"text1"]
return self._manifest[idx]
class TransformDataset(Dataset):
@@ -273,5 +259,4 @@ class AudioDataset(Dataset):
return len(self.minibatch)
def __getitem__(self, idx):
instance = self.minibatch[idx]
return instance["utt"], instance["feat"], instance["text"]
return self.minibatch[idx]

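With `__getitem__` now returning the manifest entry itself, every dataset item is a dict and the collators read it by key, as in `_batch_examples` above. A sketch of one entry (field values are illustrative; the keys follow the diff):

```python
item = {
    "utt": "utt_0001",              # utterance id
    "feat": "data/feats/0001.npy",  # hypothetical feature/audio reference
    "text": [12, 7, 43],            # target token ids (or a raw string when decoding)
    "text1": [9, 2],                # transcript ids, only in triplet (ST+ASR) data
}
utt, audio, text = item['utt'], item['feat'], item['text']
```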
@@ -322,7 +322,7 @@ class LoadInputsAndTargets():
"Not supported: loader_type={}".format(filetype))
def file_type(self, filepath):
suffix = filepath.split(":")[0].split('.')[1]
suffix = filepath.split(":")[0].split('.')[-1]
if suffix == 'ark':
return 'mat'
elif suffix == 'scp':

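The suffix fix matters because Kaldi-style specifiers can contain extra dots, so taking element `[1]` misreads such paths. A quick check with a hypothetical sharded archive:

```python
filepath = "data/feats.1.ark:1234"
filepath.split(":")[0].split('.')[1]   # -> '1'   (old code: wrong suffix)
filepath.split(":")[0].split('.')[-1]  # -> 'ark' (fixed lookup)
```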
@@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w
```bash
python3 utils/compute_mean_std.py \
--num_samples 2000 \
--specgram_type linear \
--spectrum_type linear \
--manifest_path examples/librispeech/data/manifest.train \
--output_path examples/librispeech/data/mean_std.npz
```

@@ -44,7 +44,7 @@ For CMVN, a subset of or the full training set is chosen and used to compute
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear #linear, mfcc, fbank
spectrum_type: linear #linear, mfcc, fbank
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
target_sample_rate: 16000
max_freq: None
n_fft: None

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
target_sample_rate: 16000
max_freq: None
n_fft: None

@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=2000 \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
# augmentation_config: conf/augmentation.json
batch_size: 10
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
# augmentation_config: conf/augmentation.json
batch_size: 10
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \

@@ -17,7 +17,7 @@ collator:
augmentation_config: ""
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
specgram_type: linear
spectrum_type: linear
feat_dim:
delta_delta: False
stride_ms: 10.0

@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \
--specgram_type="linear" \
--spectrum_type="linear" \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -18,7 +18,7 @@ collator:
augmentation_config: conf/augmentation.json
batch_size: 4
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0

@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \
--specgram_type="fbank" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \

@@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 2000, "# of samples used for statistics.")
add_arg('specgram_type', str,
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
@@ -58,7 +58,7 @@ def main():
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
specgram_type=args.specgram_type,
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=args.stride_ms,
