fix

4 years ago · aaeef54f46
parent 82ca0f6549
commit aaeef54f46
6 changed files with 28 additions and 33 deletions
--- a/deepspeech/frontend/augmentor/shift_perturb.py
+++ b/deepspeech/frontend/augmentor/shift_perturb.py
@@ -32,7 +32,8 @@ class ShiftPerturbAugmentor(AugmentorBase):
        self._rng = rng

    def randomize_parameters(self):
-        self.shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
+        self.shift_ms = self._rng.uniform(self._min_shift_ms,
+                                          self._max_shift_ms)

    def apply(self, audio_segment):
        audio_segment.shift(self.shift_ms)
@@ -49,7 +50,6 @@ class ShiftPerturbAugmentor(AugmentorBase):
    #         self.randomize_parameters()
    #     self.apply(audio_segment)

-
    # def transform_audio(self, audio_segment):
    #     """Shift audio.

@@ -60,5 +60,3 @@ class ShiftPerturbAugmentor(AugmentorBase):
    #     """
    #     shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
    #     audio_segment.shift(shift_ms)
-
-
--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -128,13 +128,13 @@ class SpecAugmentor(AugmentorBase):
        # n_bins = xs.shape[0]
        # n_frames = xs.shape[1]

-        self.f=[]
-        self.f_0=[]
-        self.t=[]
-        self.t_0=[]
+        self.f = []
+        self.f_0 = []
+        self.t = []
+        self.t_0 = []

        for i in range(0, self.n_freq_masks):
-            f=int(self._rng.uniform(low=0, high=self.F))
+            f = int(self._rng.uniform(low=0, high=self.F))
            self.f.append(f)
            self.f_0.append(int(self._rng.uniform(low=0, high=n_bins - f)))

@@ -174,7 +174,6 @@ class SpecAugmentor(AugmentorBase):
            assert t_0 <= t_0 + t
        return xs

-
    # def mask_freq(self, xs, replace_with_zero=False):
    #     n_bins = xs.shape[0]
    #     for i in range(0, self.n_freq_masks):
@@ -208,7 +207,6 @@ class SpecAugmentor(AugmentorBase):
    #         self._time_mask = (t_0, t_0 + t)
    #     return xs

-
    # def transform_feature(self, xs: np.ndarray, single=True):
    #     """
    #     Args:
--- a/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/deepspeech/frontend/augmentor/speed_perturb.py
@@ -79,7 +79,6 @@ class SpeedPerturbAugmentor(AugmentorBase):
            self._rates = np.linspace(
                self._min_rate, self._max_rate, self._num_rates, endpoint=True)

-
    def randomize_parameters(self):
        if self._num_rates < 0:
            self.speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
@@ -93,7 +92,7 @@ class SpeedPerturbAugmentor(AugmentorBase):

        audio_segment.change_speed(speed_rate)

-    def transform_audio(self, audio_segment,single=True):
+    def transform_audio(self, audio_segment, single=True):
        """Sample a new speed rate from the given range and
        changes the speed of the given audio clip.

@@ -102,7 +101,7 @@ class SpeedPerturbAugmentor(AugmentorBase):
        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
-        if(single):
+        if (single):
            self.randomize_parameters()
        self.apply(audio_segment)

--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -174,7 +174,6 @@ class SpeechCollator():
        self._stride_ms = stride_ms
        self._target_sample_rate = target_sample_rate

-
        self._speech_featurizer = SpeechFeaturizer(
            unit_type=unit_type,
            vocab_filepath=vocab_filepath,
@@ -231,7 +230,8 @@ class SpeechCollator():
        self._augmentation_pipeline.randomize_parameters_audio_transform()

    def randomize_feature_parameters(self, n_frames, n_bins):
-        self._augmentation_pipeline.randomize_parameters_feature_transform(n_frames, n_bins)
+        self._augmentation_pipeline.randomize_parameters_feature_transform(
+            n_frames, n_bins)

    def process_feature_and_transform(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.
@@ -261,7 +261,6 @@ class SpeechCollator():

        return specgram, transcript_part

-
    # def process_utterance(self, audio_file, transcript, single=True):
    #     """Load, augment, featurize and normalize for speech data.

@@ -282,7 +281,6 @@ class SpeechCollator():
    #     # audio augment
    #     self._augmentation_pipeline.transform_audio(speech_segment)

-
    #     # Spectrum transform
    #     specgram, transcript_part = self._speech_featurizer.featurize(
    #         speech_segment, self._keep_transcription_text)
@@ -352,12 +350,14 @@ class SpeechCollator():
        text_lens = np.array(text_lens).astype(np.int64)

        #spec augment
-        n_bins=padded_audios.shape[2]
+        n_bins = padded_audios.shape[2]
        self.randomize_feature_parameters(min(audio_lens), n_bins)
        for i in range(len(padded_audios)):
            if not self._randomize_each_batch:
                self.randomize_feature_parameters(audio_lens[i], n_bins)
-            padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
+            padded_audios[
+                i] = self._augmentation_pipeline.apply_feature_transform(
+                    padded_audios[i])

        return utts, padded_audios, audio_lens, padded_texts, text_lens