From 93ae5999ae5df0fbba9d1bc83f8e593413feccc7 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Thu, 17 Jun 2021 12:12:39 +0000 Subject: [PATCH] add resampler and apply --- deepspeech/frontend/augmentor/augmentation.py | 8 +-- .../frontend/augmentor/shift_perturb.py | 27 ++++++- deepspeech/frontend/augmentor/spec_augment.py | 70 +++++++++++++++++-- .../frontend/augmentor/speed_perturb.py | 45 +++++++++--- deepspeech/io/collator.py | 14 ++-- 5 files changed, 139 insertions(+), 25 deletions(-) diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py index cc0564daf..ea41a7101 100644 --- a/deepspeech/frontend/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -93,7 +93,7 @@ class AugmentationPipeline(): self._spec_augmentors, self._spec_rates = self._parse_pipeline_from( augmentation_config, 'feature') - def transform_audio(self, audio_segment): + def transform_audio(self, audio_segment, single=True): """Run the pre-processing pipeline for data augmentation. Note that this is an in-place transformation. @@ -103,9 +103,9 @@ class AugmentationPipeline(): """ for augmentor, rate in zip(self._augmentors, self._rates): if self._rng.uniform(0., 1.) < rate: - augmentor.transform_audio(audio_segment) + augmentor.transform_audio(audio_segment, single) - def transform_feature(self, spec_segment): + def transform_feature(self, spec_segment, single=True): """spectrogram augmentation. Args: @@ -113,7 +113,7 @@ class AugmentationPipeline(): """ for augmentor, rate in zip(self._spec_augmentors, self._spec_rates): if self._rng.uniform(0., 1.) 
< rate: - spec_segment = augmentor.transform_feature(spec_segment) + spec_segment = augmentor.transform_feature(spec_segment, single) return spec_segment def _parse_pipeline_from(self, config_json, aug_type='audio'): diff --git a/deepspeech/frontend/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py index 9cc3fe2d0..8acfb5e54 100644 --- a/deepspeech/frontend/augmentor/shift_perturb.py +++ b/deepspeech/frontend/augmentor/shift_perturb.py @@ -31,7 +31,13 @@ class ShiftPerturbAugmentor(AugmentorBase): self._max_shift_ms = max_shift_ms self._rng = rng - def transform_audio(self, audio_segment): + def randomize_parameters(self): + self.shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) + + def apply(self, audio_segment): + audio_segment.shift(self.shift_ms) + + def transform_audio(self, audio_segment, single): """Shift audio. Note that this is an in-place transformation. @@ -39,5 +45,20 @@ class ShiftPerturbAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegmenet|SpeechSegment """ - shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) - audio_segment.shift(shift_ms) + if(single): + self.randomize_parameters() + self.apply(audio_segment) + + + # def transform_audio(self, audio_segment): + # """Shift audio. + + # Note that this is an in-place transformation. + + # :param audio_segment: Audio segment to add effects to. 
+ # :type audio_segment: AudioSegmenet|SpeechSegment + # """ + # shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms) + # audio_segment.shift(shift_ms) + + diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index 1c2e09fc7..c0245537a 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -123,6 +123,54 @@ class SpecAugmentor(AugmentorBase): def time_warp(xs, W=40): raise NotImplementedError + + def randomize_parameters(self, xs): + n_bins = xs.shape[0] + n_frames = xs.shape[1] + + self.f=[] + self.f_0=[] + self.t=[] + self.t_0=[] + + for i in range(0, self.n_freq_masks): + f=int(self._rng.uniform(low=0, high=self.F)) + self.f.append(f) + self.f_0.append(int(self._rng.uniform(low=0, high=n_bins - f))) + + if self.adaptive_number_ratio > 0: + n_masks = int(n_frames * self.adaptive_number_ratio) + self.n_masks = min(n_masks, self.max_n_time_masks) + else: + self.n_masks = self.n_time_masks + + if self.adaptive_size_ratio > 0: + T = self.adaptive_size_ratio * n_frames + else: + T = self.T + + for i in range(self.n_masks): + t = int(self._rng.uniform(low=0, high=T)) + t = min(t, int(n_frames * self.p)) + self.t.append(t) + self.t_0.append(int(self._rng.uniform(low=0, high=n_frames - t))) + + def apply(self, xs: np.ndarray): + n_bins = xs.shape[0] + n_frames = xs.shape[1] + for i in range(0, self.n_freq_masks): + f = self.f[i] + f_0 = self.f_0[i] + xs[f_0:f_0 + f, :] = 0 + assert f_0 <= f_0 + f + + for i in range(self.n_masks): + t = self.t[i] + t_0 = self.t_0[i] + xs[:, t_0:t_0 + t] = 0 + assert t_0 <= t_0 + t + return xs + def mask_freq(self, xs, replace_with_zero=False): n_bins = xs.shape[0] @@ -157,14 +205,26 @@ class SpecAugmentor(AugmentorBase): self._time_mask = (t_0, t_0 + t) return xs - def transform_feature(self, xs: np.ndarray): + + def transform_feature(self, xs: np.ndarray, single=True): """ Args: xs (FloatTensor): `[F, T]` 
Returns: xs (FloatTensor): `[F, T]` """ - # xs = self.time_warp(xs) - xs = self.mask_freq(xs) - xs = self.mask_time(xs) - return xs + if(single): + self.randomize_parameters(xs) + return self.apply(xs) + + # def transform_feature(self, xs: np.ndarray): + # """ + # Args: + # xs (FloatTensor): `[F, T]` + # Returns: + # xs (FloatTensor): `[F, T]` + # """ + # # xs = self.time_warp(xs) + # xs = self.mask_freq(xs) + # xs = self.mask_time(xs) + # return xs diff --git a/deepspeech/frontend/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py index d0977c131..5c5fd0721 100644 --- a/deepspeech/frontend/augmentor/speed_perturb.py +++ b/deepspeech/frontend/augmentor/speed_perturb.py @@ -79,7 +79,21 @@ class SpeedPerturbAugmentor(AugmentorBase): self._rates = np.linspace( self._min_rate, self._max_rate, self._num_rates, endpoint=True) - def transform_audio(self, audio_segment): + + def randomize_parameters(self): + if self._num_rates < 0: + self.speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + else: + self.speed_rate = self._rng.choice(self._rates) + + def apply(self, audio_segment): + # Skip perturbation in case of identity speed rate + if self.speed_rate == 1.0: + return + + audio_segment.change_speed(self.speed_rate) + + def transform_audio(self, audio_segment,single=True): """Sample a new speed rate from the given range and changes the speed of the given audio clip. Note that this is an in-place transformation. :param audio_segment: Audio segment to add effects to. 
:type audio_segment: AudioSegment|SpeechSegment """ - if self._num_rates < 0: - speed_rate = self._rng.uniform(self._min_rate, self._max_rate) - else: - speed_rate = self._rng.choice(self._rates) + if(single): + self.randomize_parameters() + self.apply(audio_segment) - # Skip perturbation in case of identity speed rate - if speed_rate == 1.0: - return + # def transform_audio(self, audio_segment): + # """Sample a new speed rate from the given range and + # changes the speed of the given audio clip. - audio_segment.change_speed(speed_rate) + # Note that this is an in-place transformation. + + # :param audio_segment: Audio segment to add effects to. + # :type audio_segment: AudioSegment|SpeechSegment + # """ + # if self._num_rates < 0: + # speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + # else: + # speed_rate = self._rng.choice(self._rates) + + # # Skip perturbation in case of identity speed rate + # if speed_rate == 1.0: + # return + + # audio_segment.change_speed(speed_rate) diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 8b8575dbd..79002f5f1 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -192,7 +192,7 @@ class SpeechCollator(): return self._local_data.tar2object[tarpath].extractfile( self._local_data.tar2info[tarpath][filename]) - def process_utterance(self, audio_file, transcript): + def process_utterance(self, audio_file, transcript, single=True): """Load, augment, featurize and normalize for speech data. :param audio_file: Filepath or file object of audio file. 
@@ -214,7 +214,7 @@ class SpeechCollator(): # audio augment start_time = time.time() - self._augmentation_pipeline.transform_audio(speech_segment) + self._augmentation_pipeline.transform_audio(speech_segment, single) audio_aug_time = time.time() - start_time #logger.debug(f"audio augmentation time: {audio_aug_time}") @@ -228,7 +228,7 @@ class SpeechCollator(): # specgram augment start_time = time.time() - specgram = self._augmentation_pipeline.transform_feature(specgram) + specgram = self._augmentation_pipeline.transform_feature(specgram, single) feature_aug_time = time.time() - start_time #logger.debug(f"audio feature augmentation time: {feature_aug_time}") return specgram, transcript_part @@ -253,8 +253,14 @@ class SpeechCollator(): texts = [] text_lens = [] utts = [] + # print('----debug---') + # print(batch) + # print(type(batch)) + # print(len(batch)) + resample=True for utt, audio, text in batch: - audio, text = self.process_utterance(audio, text) + audio, text = self.process_utterance(audio, text, single=resample) + # resample=False #utt utts.append(utt) # audio