pull/673/head
Haoxin Ma 4 years ago
parent 82ca0f6549
commit aaeef54f46

@ -103,7 +103,7 @@ class AugmentationPipeline():
""" """
for augmentor, rate in zip(self._augmentors, self._rates): for augmentor, rate in zip(self._augmentors, self._rates):
augmentor.randomize_parameters() augmentor.randomize_parameters()
def randomize_parameters_feature_transform(self, n_frames, n_bins): def randomize_parameters_feature_transform(self, n_frames, n_bins):
"""Run the pre-processing pipeline for data augmentation. """Run the pre-processing pipeline for data augmentation.
@ -142,7 +142,7 @@ class AugmentationPipeline():
# """Run the pre-processing pipeline for data augmentation. # """Run the pre-processing pipeline for data augmentation.
# Note that this is an in-place transformation. # Note that this is an in-place transformation.
# :param audio_segment: Audio segment to process. # :param audio_segment: Audio segment to process.
# :type audio_segment: AudioSegmenet|SpeechSegment # :type audio_segment: AudioSegmenet|SpeechSegment
# """ # """
@ -152,7 +152,7 @@ class AugmentationPipeline():
# def transform_feature(self, spec_segment, single=True): # def transform_feature(self, spec_segment, single=True):
# """spectrogram augmentation. # """spectrogram augmentation.
# Args: # Args:
# spec_segment (np.ndarray): audio feature, (D, T). # spec_segment (np.ndarray): audio feature, (D, T).
# """ # """

@ -32,7 +32,8 @@ class ShiftPerturbAugmentor(AugmentorBase):
self._rng = rng self._rng = rng
def randomize_parameters(self): def randomize_parameters(self):
self.shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) self.shift_ms = self._rng.uniform(self._min_shift_ms,
self._max_shift_ms)
def apply(self, audio_segment): def apply(self, audio_segment):
audio_segment.shift(self.shift_ms) audio_segment.shift(self.shift_ms)
@ -49,7 +50,6 @@ class ShiftPerturbAugmentor(AugmentorBase):
# self.randomize_parameters() # self.randomize_parameters()
# self.apply(audio_segment) # self.apply(audio_segment)
# def transform_audio(self, audio_segment): # def transform_audio(self, audio_segment):
# """Shift audio. # """Shift audio.
@ -60,5 +60,3 @@ class ShiftPerturbAugmentor(AugmentorBase):
# """ # """
# shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) # shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
# audio_segment.shift(shift_ms) # audio_segment.shift(shift_ms)

@ -123,18 +123,18 @@ class SpecAugmentor(AugmentorBase):
def time_warp(xs, W=40): def time_warp(xs, W=40):
raise NotImplementedError raise NotImplementedError
def randomize_parameters(self, n_frames, n_bins): def randomize_parameters(self, n_frames, n_bins):
# n_bins = xs.shape[0] # n_bins = xs.shape[0]
# n_frames = xs.shape[1] # n_frames = xs.shape[1]
self.f=[] self.f = []
self.f_0=[] self.f_0 = []
self.t=[] self.t = []
self.t_0=[] self.t_0 = []
for i in range(0, self.n_freq_masks): for i in range(0, self.n_freq_masks):
f=int(self._rng.uniform(low=0, high=self.F)) f = int(self._rng.uniform(low=0, high=self.F))
self.f.append(f) self.f.append(f)
self.f_0.append(int(self._rng.uniform(low=0, high=n_bins - f))) self.f_0.append(int(self._rng.uniform(low=0, high=n_bins - f)))
@ -166,7 +166,7 @@ class SpecAugmentor(AugmentorBase):
f_0 = self.f_0[i] f_0 = self.f_0[i]
xs[:, f_0:f_0 + f] = 0 xs[:, f_0:f_0 + f] = 0
assert f_0 <= f_0 + f assert f_0 <= f_0 + f
for i in range(self.n_masks): for i in range(self.n_masks):
t = self.t[i] t = self.t[i]
t_0 = self.t_0[i] t_0 = self.t_0[i]
@ -174,7 +174,6 @@ class SpecAugmentor(AugmentorBase):
assert t_0 <= t_0 + t assert t_0 <= t_0 + t
return xs return xs
# def mask_freq(self, xs, replace_with_zero=False): # def mask_freq(self, xs, replace_with_zero=False):
# n_bins = xs.shape[0] # n_bins = xs.shape[0]
# for i in range(0, self.n_freq_masks): # for i in range(0, self.n_freq_masks):
@ -208,7 +207,6 @@ class SpecAugmentor(AugmentorBase):
# self._time_mask = (t_0, t_0 + t) # self._time_mask = (t_0, t_0 + t)
# return xs # return xs
# def transform_feature(self, xs: np.ndarray, single=True): # def transform_feature(self, xs: np.ndarray, single=True):
# """ # """
# Args: # Args:

@ -79,7 +79,6 @@ class SpeedPerturbAugmentor(AugmentorBase):
self._rates = np.linspace( self._rates = np.linspace(
self._min_rate, self._max_rate, self._num_rates, endpoint=True) self._min_rate, self._max_rate, self._num_rates, endpoint=True)
def randomize_parameters(self): def randomize_parameters(self):
if self._num_rates < 0: if self._num_rates < 0:
self.speed_rate = self._rng.uniform(self._min_rate, self._max_rate) self.speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
@ -92,8 +91,8 @@ class SpeedPerturbAugmentor(AugmentorBase):
return return
audio_segment.change_speed(speed_rate) audio_segment.change_speed(speed_rate)
def transform_audio(self, audio_segment,single=True): def transform_audio(self, audio_segment, single=True):
"""Sample a new speed rate from the given range and """Sample a new speed rate from the given range and
changes the speed of the given audio clip. changes the speed of the given audio clip.
@ -102,7 +101,7 @@ class SpeedPerturbAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to. :param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment :type audio_segment: AudioSegment|SpeechSegment
""" """
if(single): if (single):
self.randomize_parameters() self.randomize_parameters()
self.apply(audio_segment) self.apply(audio_segment)

@ -195,7 +195,7 @@ class AudioFeaturizer(object):
ind = np.where(freqs <= max_freq)[0][-1] + 1 ind = np.where(freqs <= max_freq)[0][-1] + 1
specgram = np.log(specgram[:ind, :] + eps) specgram = np.log(specgram[:ind, :] + eps)
specgram = np.transpose(specgram) #T,D specgram = np.transpose(specgram) #T,D
return specgram return specgram
def _specgram_real(self, samples, window_size, stride_size, sample_rate): def _specgram_real(self, samples, window_size, stride_size, sample_rate):
@ -299,7 +299,7 @@ class AudioFeaturizer(object):
ceplifter=22, ceplifter=22,
useEnergy=True, useEnergy=True,
winfunc='povey') winfunc='povey')
mfcc_feat = np.transpose(mfcc_feat) mfcc_feat = np.transpose(mfcc_feat)
if delta_delta: if delta_delta:
mfcc_feat = self._concat_delta_delta(mfcc_feat) mfcc_feat = self._concat_delta_delta(mfcc_feat)

@ -173,7 +173,6 @@ class SpeechCollator():
self._stride_ms = stride_ms self._stride_ms = stride_ms
self._target_sample_rate = target_sample_rate self._target_sample_rate = target_sample_rate
self._speech_featurizer = SpeechFeaturizer( self._speech_featurizer = SpeechFeaturizer(
unit_type=unit_type, unit_type=unit_type,
@ -229,9 +228,10 @@ class SpeechCollator():
def randomize_audio_parameters(self): def randomize_audio_parameters(self):
self._augmentation_pipeline.randomize_parameters_audio_transform() self._augmentation_pipeline.randomize_parameters_audio_transform()
def randomize_feature_parameters(self, n_frames, n_bins): def randomize_feature_parameters(self, n_frames, n_bins):
self._augmentation_pipeline.randomize_parameters_feature_transform(n_frames, n_bins) self._augmentation_pipeline.randomize_parameters_feature_transform(
n_frames, n_bins)
def process_feature_and_transform(self, audio_file, transcript): def process_feature_and_transform(self, audio_file, transcript):
"""Load, augment, featurize and normalize for speech data. """Load, augment, featurize and normalize for speech data.
@ -252,7 +252,7 @@ class SpeechCollator():
# Spectrum transform # Spectrum transform
specgram, transcript_part = self._speech_featurizer.featurize( specgram, transcript_part = self._speech_featurizer.featurize(
speech_segment, self._keep_transcription_text) speech_segment, self._keep_transcription_text)
if self._normalizer: if self._normalizer:
specgram = self._normalizer.apply(specgram) specgram = self._normalizer.apply(specgram)
@ -261,7 +261,6 @@ class SpeechCollator():
return specgram, transcript_part return specgram, transcript_part
# def process_utterance(self, audio_file, transcript, single=True): # def process_utterance(self, audio_file, transcript, single=True):
# """Load, augment, featurize and normalize for speech data. # """Load, augment, featurize and normalize for speech data.
@ -282,11 +281,10 @@ class SpeechCollator():
# # audio augment # # audio augment
# self._augmentation_pipeline.transform_audio(speech_segment) # self._augmentation_pipeline.transform_audio(speech_segment)
# # Spectrum transform # # Spectrum transform
# specgram, transcript_part = self._speech_featurizer.featurize( # specgram, transcript_part = self._speech_featurizer.featurize(
# speech_segment, self._keep_transcription_text) # speech_segment, self._keep_transcription_text)
# if self._normalizer: # if self._normalizer:
# specgram = self._normalizer.apply(specgram) # specgram = self._normalizer.apply(specgram)
@ -350,14 +348,16 @@ class SpeechCollator():
padded_texts = pad_sequence( padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64) texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64)
#spec augment #spec augment
n_bins=padded_audios.shape[2] n_bins = padded_audios.shape[2]
self.randomize_feature_parameters(min(audio_lens), n_bins) self.randomize_feature_parameters(min(audio_lens), n_bins)
for i in range(len(padded_audios)): for i in range(len(padded_audios)):
if not self._randomize_each_batch: if not self._randomize_each_batch:
self.randomize_feature_parameters(audio_lens[i], n_bins) self.randomize_feature_parameters(audio_lens[i], n_bins)
padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i]) padded_audios[
i] = self._augmentation_pipeline.apply_feature_transform(
padded_audios[i])
return utts, padded_audios, audio_lens, padded_texts, text_lens return utts, padded_audios, audio_lens, padded_texts, text_lens

Loading…
Cancel
Save