diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py index ac9957ebe..c9780690c 100644 --- a/deepspeech/frontend/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -103,7 +103,7 @@ class AugmentationPipeline(): """ for augmentor, rate in zip(self._augmentors, self._rates): augmentor.randomize_parameters() - + def randomize_parameters_feature_transform(self, n_frames, n_bins): """Run the pre-processing pipeline for data augmentation. @@ -142,7 +142,7 @@ class AugmentationPipeline(): # """Run the pre-processing pipeline for data augmentation. # Note that this is an in-place transformation. - + # :param audio_segment: Audio segment to process. # :type audio_segment: AudioSegmenet|SpeechSegment # """ @@ -152,7 +152,7 @@ class AugmentationPipeline(): # def transform_feature(self, spec_segment, single=True): # """spectrogram augmentation. - + # Args: # spec_segment (np.ndarray): audio feature, (D, T). # """ diff --git a/deepspeech/frontend/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py index cc91b402b..279151c6e 100644 --- a/deepspeech/frontend/augmentor/shift_perturb.py +++ b/deepspeech/frontend/augmentor/shift_perturb.py @@ -32,7 +32,8 @@ class ShiftPerturbAugmentor(AugmentorBase): self._rng = rng def randomize_parameters(self): - self.shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) + self.shift_ms = self._rng.uniform(self._min_shift_ms, + self._max_shift_ms) def apply(self, audio_segment): audio_segment.shift(self.shift_ms) @@ -49,7 +50,6 @@ class ShiftPerturbAugmentor(AugmentorBase): # self.randomize_parameters() # self.apply(audio_segment) - # def transform_audio(self, audio_segment): # """Shift audio. @@ -60,5 +60,3 @@ class ShiftPerturbAugmentor(AugmentorBase): # """ # shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) # audio_segment.shift(shift_ms) - - diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index f0e6a5ece..637e632a4 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -123,18 +123,18 @@ class SpecAugmentor(AugmentorBase): def time_warp(xs, W=40): raise NotImplementedError - + def randomize_parameters(self, n_frames, n_bins): # n_bins = xs.shape[0] # n_frames = xs.shape[1] - self.f=[] - self.f_0=[] - self.t=[] - self.t_0=[] + self.f = [] + self.f_0 = [] + self.t = [] + self.t_0 = [] for i in range(0, self.n_freq_masks): - f=int(self._rng.uniform(low=0, high=self.F)) + f = int(self._rng.uniform(low=0, high=self.F)) self.f.append(f) self.f_0.append(int(self._rng.uniform(low=0, high=n_bins - f))) @@ -166,7 +166,7 @@ class SpecAugmentor(AugmentorBase): f_0 = self.f_0[i] xs[:, f_0:f_0 + f] = 0 assert f_0 <= f_0 + f - + for i in range(self.n_masks): t = self.t[i] t_0 = self.t_0[i] @@ -174,7 +174,6 @@ class SpecAugmentor(AugmentorBase): assert t_0 <= t_0 + t return xs - # def mask_freq(self, xs, replace_with_zero=False): # n_bins = xs.shape[0] # for i in range(0, self.n_freq_masks): @@ -208,7 +207,6 @@ class SpecAugmentor(AugmentorBase): # self._time_mask = (t_0, t_0 + t) # return xs - # def transform_feature(self, xs: np.ndarray, single=True): # """ # Args: diff --git a/deepspeech/frontend/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py index 5c5fd0721..d233ed9b4 100644 --- a/deepspeech/frontend/augmentor/speed_perturb.py +++ b/deepspeech/frontend/augmentor/speed_perturb.py @@ -79,7 +79,6 @@ class SpeedPerturbAugmentor(AugmentorBase): self._rates = np.linspace( self._min_rate, self._max_rate, self._num_rates, endpoint=True) - def randomize_parameters(self): if self._num_rates < 0: self.speed_rate = self._rng.uniform(self._min_rate, self._max_rate) @@ -92,8 +91,8 @@ class SpeedPerturbAugmentor(AugmentorBase): return audio_segment.change_speed(speed_rate) - - def transform_audio(self, audio_segment,single=True): + + def transform_audio(self, audio_segment, single=True): """Sample a new speed rate from the given range and changes the speed of the given audio clip. @@ -102,7 +101,7 @@ class SpeedPerturbAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegment|SpeechSegment """ - if(single): + if (single): self.randomize_parameters() self.apply(audio_segment) diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index b537e7335..bf95a05c8 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -195,7 +195,7 @@ class AudioFeaturizer(object): ind = np.where(freqs <= max_freq)[0][-1] + 1 specgram = np.log(specgram[:ind, :] + eps) - specgram = np.transpose(specgram) #T,D + specgram = np.transpose(specgram) #T,D return specgram def _specgram_real(self, samples, window_size, stride_size, sample_rate): @@ -299,7 +299,7 @@ class AudioFeaturizer(object): ceplifter=22, useEnergy=True, winfunc='povey') - + mfcc_feat = np.transpose(mfcc_feat) if delta_delta: mfcc_feat = self._concat_delta_delta(mfcc_feat) diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 514dc2cc3..7510dee04 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -173,7 +173,6 @@ class SpeechCollator(): self._stride_ms = stride_ms self._target_sample_rate = target_sample_rate - self._speech_featurizer = SpeechFeaturizer( unit_type=unit_type, @@ -229,9 +228,10 @@ class SpeechCollator(): def randomize_audio_parameters(self): self._augmentation_pipeline.randomize_parameters_audio_transform() - + def randomize_feature_parameters(self, n_frames, n_bins): - self._augmentation_pipeline.randomize_parameters_feature_transform(n_frames, n_bins) + self._augmentation_pipeline.randomize_parameters_feature_transform( + n_frames, n_bins) def process_feature_and_transform(self, audio_file, transcript): """Load, augment, featurize and normalize for speech data. @@ -252,7 +252,7 @@ class SpeechCollator(): # Spectrum transform specgram, transcript_part = self._speech_featurizer.featurize( speech_segment, self._keep_transcription_text) - + if self._normalizer: specgram = self._normalizer.apply(specgram) @@ -261,7 +261,6 @@ class SpeechCollator(): return specgram, transcript_part - # def process_utterance(self, audio_file, transcript, single=True): # """Load, augment, featurize and normalize for speech data. @@ -282,11 +281,10 @@ class SpeechCollator(): # # audio augment # self._augmentation_pipeline.transform_audio(speech_segment) - # # Spectrum transform # specgram, transcript_part = self._speech_featurizer.featurize( # speech_segment, self._keep_transcription_text) - + # if self._normalizer: # specgram = self._normalizer.apply(specgram) @@ -350,14 +348,16 @@ class SpeechCollator(): padded_texts = pad_sequence( texts, padding_value=IGNORE_ID).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64) - + #spec augment - n_bins=padded_audios.shape[2] + n_bins = padded_audios.shape[2] self.randomize_feature_parameters(min(audio_lens), n_bins) for i in range(len(padded_audios)): - if not self._randomize_each_batch: + if not self._randomize_each_batch: self.randomize_feature_parameters(audio_lens[i], n_bins) - padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i]) + padded_audios[ + i] = self._augmentation_pipeline.apply_feature_transform( + padded_audios[i]) return utts, padded_audios, audio_lens, padded_texts, text_lens