From 50f10f37ae8224a5d143ecfc59e31af1d992e695 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 20 Aug 2021 03:28:55 +0000 Subject: [PATCH] support replace with mean by aug --- deepspeech/__init__.py | 41 +------------------ .../frontend/augmentor/impulse_response.py | 2 +- .../frontend/augmentor/noise_perturb.py | 2 +- .../online_bayesian_normalization.py | 2 +- deepspeech/frontend/augmentor/resample.py | 2 +- .../frontend/augmentor/shift_perturb.py | 2 +- deepspeech/frontend/augmentor/spec_augment.py | 21 +++++++--- .../frontend/augmentor/speed_perturb.py | 2 +- .../frontend/augmentor/volume_perturb.py | 2 +- examples/aishell/s0/conf/augmentation.json | 3 +- examples/aishell/s1/conf/augmentation.json | 3 +- examples/aug_conf/augmentation.json | 10 ----- .../augmentation.json} | 3 +- examples/callcenter/s1/conf/augmentation.json | 3 +- .../librispeech/s0/conf/augmentation.json | 3 +- .../librispeech/s1/conf/augmentation.json | 3 +- .../librispeech/s2/conf/augmentation.json | 3 +- examples/timit/s1/conf/augmentation.json | 3 +- examples/tiny/s0/conf/augmentation.json | 25 +++++++++++ examples/tiny/s1/conf/augmentation.json | 3 +- 20 files changed, 66 insertions(+), 72 deletions(-) delete mode 100644 examples/aug_conf/augmentation.json rename examples/{aug_conf/augmentation.example.json => augmentation/augmentation.json} (94%) diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index 88f81075..fbec5a5e 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -352,45 +352,6 @@ if not hasattr(paddle.Tensor, 'tolist'): "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) -########### hcak paddle.nn.functional ############# - - -def glu(x: paddle.Tensor, axis=-1) -> paddle.Tensor: - """The gated linear unit (GLU) activation.""" - a, b = x.split(2, axis=axis) - act_b = F.sigmoid(b) - return a * act_b - - -if not hasattr(paddle.nn.functional, 'glu'): - logger.warn( - "register user glu to paddle.nn.functional, remove this when fixed!") - setattr(paddle.nn.functional, 'glu', glu) - -# def softplus(x): -# """Softplus function.""" -# if hasattr(paddle.nn.functional, 'softplus'): -# #return paddle.nn.functional.softplus(x.float()).type_as(x) -# return paddle.nn.functional.softplus(x) -# else: -# raise NotImplementedError - -# def gelu_accurate(x): -# """Gaussian Error Linear Units (GELU) activation.""" -# # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py -# if not hasattr(gelu_accurate, "_a"): -# gelu_accurate._a = math.sqrt(2 / math.pi) -# return 0.5 * x * (1 + paddle.tanh(gelu_accurate._a * -# (x + 0.044715 * paddle.pow(x, 3)))) - -# def gelu(x): -# """Gaussian Error Linear Units (GELU) activation.""" -# if hasattr(nn.functional, 'gelu'): -# #return nn.functional.gelu(x.float()).type_as(x) -# return nn.functional.gelu(x) -# else: -# return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) - ########### hcak paddle.nn ############# class GLU(nn.Layer): @@ -401,7 +362,7 @@ class GLU(nn.Layer): self.dim = dim def forward(self, xs): - return glu(xs, dim=self.dim) + return F.glu(xs, dim=self.dim) if not hasattr(paddle.nn, 'GLU'): diff --git a/deepspeech/frontend/augmentor/impulse_response.py b/deepspeech/frontend/augmentor/impulse_response.py index b1a732ad..818251ed 100644 --- a/deepspeech/frontend/augmentor/impulse_response.py +++ b/deepspeech/frontend/augmentor/impulse_response.py @@ -32,7 +32,7 @@ class ImpulseResponseAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/deepspeech/frontend/augmentor/noise_perturb.py b/deepspeech/frontend/augmentor/noise_perturb.py index 8be5931b..790b0c39 100644 --- a/deepspeech/frontend/augmentor/noise_perturb.py +++ b/deepspeech/frontend/augmentor/noise_perturb.py @@ -38,7 +38,7 @@ class NoisePerturbAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/deepspeech/frontend/augmentor/online_bayesian_normalization.py b/deepspeech/frontend/augmentor/online_bayesian_normalization.py index 4b5e2301..0f9d3ef6 100644 --- a/deepspeech/frontend/augmentor/online_bayesian_normalization.py +++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py @@ -46,7 +46,7 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/deepspeech/frontend/augmentor/resample.py b/deepspeech/frontend/augmentor/resample.py index a8c0c662..509fe003 100644 --- a/deepspeech/frontend/augmentor/resample.py +++ b/deepspeech/frontend/augmentor/resample.py @@ -33,7 +33,7 @@ class ResampleAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/deepspeech/frontend/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py index a76fb51c..8b7439fe 100644 --- a/deepspeech/frontend/augmentor/shift_perturb.py +++ b/deepspeech/frontend/augmentor/shift_perturb.py @@ -33,7 +33,7 @@ class ShiftPerturbAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index bfa8300a..67b6cfdd 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -41,7 +41,8 @@ class SpecAugmentor(AugmentorBase): W=40, adaptive_number_ratio=0, adaptive_size_ratio=0, - max_n_time_masks=20): + max_n_time_masks=20, + replace_with_zero=True): """SpecAugment class. Args: rng (random.Random): random generator object. @@ -54,9 +55,11 @@ class SpecAugmentor(AugmentorBase): adaptive_number_ratio (float): adaptive multiplicity ratio for time masking adaptive_size_ratio (float): adaptive size ratio for time masking max_n_time_masks (int): maximum number of time masking + replace_with_zero (bool): pad zero on mask if true else use mean """ super().__init__() self._rng = rng + self.replace_with_zero = replace_with_zero self.W = W self.F = F @@ -124,15 +127,18 @@ class SpecAugmentor(AugmentorBase): return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}" def time_warp(xs, W=40): - raise NotImplementedError + return xs def mask_freq(self, xs, replace_with_zero=False): n_bins = xs.shape[0] for i in range(0, self.n_freq_masks): f = int(self._rng.uniform(low=0, high=self.F)) f_0 = int(self._rng.uniform(low=0, high=n_bins - f)) - xs[f_0:f_0 + f, :] = 0 assert f_0 <= f_0 + f + if self.replace_with_zero: + xs[f_0:f_0 + f, :] = 0 + else: + xs[f_0:f_0 + f, :] = xs.mean() self._freq_mask = (f_0, f_0 + f) return xs @@ -154,14 +160,17 @@ class SpecAugmentor(AugmentorBase): t = int(self._rng.uniform(low=0, high=T)) t = min(t, int(n_frames * self.p)) t_0 = int(self._rng.uniform(low=0, high=n_frames - t)) - xs[:, t_0:t_0 + t] = 0 assert t_0 <= t_0 + t + if self.replace_with_zero: + xs[:, t_0:t_0 + t] = 0 + else: + xs[:, t_0:t_0 + t] = xs.mean() self._time_mask = (t_0, t_0 + t) return xs def __call__(self, x, train=True): if not train: - return + return x return self.transform_feature(x) def transform_feature(self, xs: np.ndarray): @@ -171,7 +180,7 @@ class SpecAugmentor(AugmentorBase): Returns: xs (FloatTensor): `[F, T]` """ - # xs = self.time_warp(xs) + xs = self.time_warp(xs) xs = self.mask_freq(xs) xs = self.mask_time(xs) return xs diff --git a/deepspeech/frontend/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py index eec2e551..ce8dfde0 100644 --- a/deepspeech/frontend/augmentor/speed_perturb.py +++ b/deepspeech/frontend/augmentor/speed_perturb.py @@ -81,7 +81,7 @@ class SpeedPerturbAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/deepspeech/frontend/augmentor/volume_perturb.py b/deepspeech/frontend/augmentor/volume_perturb.py index d08f75c3..70cb2889 100644 --- a/deepspeech/frontend/augmentor/volume_perturb.py +++ b/deepspeech/frontend/augmentor/volume_perturb.py @@ -39,7 +39,7 @@ class VolumePerturbAugmentor(AugmentorBase): def __call__(self, x, uttid=None, train=True): if not train: - return + return x self.transform_audio(x) return x diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/s0/conf/augmentation.json index 1987ad42..81d110b0 100644 --- a/examples/aishell/s0/conf/augmentation.json +++ b/examples/aishell/s0/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/s1/conf/augmentation.json index 1987ad42..81d110b0 100644 --- a/examples/aishell/s1/conf/augmentation.json +++ b/examples/aishell/s1/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/aug_conf/augmentation.json b/examples/aug_conf/augmentation.json deleted file mode 100644 index a1a759e6..00000000 --- a/examples/aug_conf/augmentation.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "type": "shift", - "params": { - "min_shift_ms": -5, - "max_shift_ms": 5 - }, - "prob": 1.0 - } -] diff --git a/examples/aug_conf/augmentation.example.json b/examples/augmentation/augmentation.json similarity index 94% rename from examples/aug_conf/augmentation.example.json rename to examples/augmentation/augmentation.json index efae2e5e..baf2cac3 100644 --- a/examples/aug_conf/augmentation.example.json +++ b/examples/augmentation/augmentation.json @@ -60,7 +60,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 0.0 } diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/s1/conf/augmentation.json index 1987ad42..81d110b0 100644 --- a/examples/callcenter/s1/conf/augmentation.json +++ b/examples/callcenter/s1/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/s0/conf/augmentation.json index 1987ad42..81d110b0 100644 --- a/examples/librispeech/s0/conf/augmentation.json +++ b/examples/librispeech/s0/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/s1/conf/augmentation.json index c1078393..7dd158eb 100644 --- a/examples/librispeech/s1/conf/augmentation.json +++ b/examples/librispeech/s1/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/s2/conf/augmentation.json index 49fe333e..cc8c7e00 100644 --- a/examples/librispeech/s2/conf/augmentation.json +++ b/examples/librispeech/s2/conf/augmentation.json @@ -10,7 +10,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/s1/conf/augmentation.json index c1078393..7dd158eb 100644 --- a/examples/timit/s1/conf/augmentation.json +++ b/examples/timit/s1/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/s0/conf/augmentation.json index a1a759e6..8f9ff7fd 100644 --- a/examples/tiny/s0/conf/augmentation.json +++ b/examples/tiny/s0/conf/augmentation.json @@ -1,4 +1,13 @@ [ + { + "type": "speed", + "params": { + "min_speed_rate": 0.9, + "max_speed_rate": 1.1, + "num_rates": 3 + }, + "prob": 1.0 + }, { "type": "shift", "params": { @@ -6,5 +15,21 @@ "max_shift_ms": 5 }, "prob": 1.0 + }, + { + "type": "specaug", + "params": { + "F": 10, + "T": 50, + "n_freq_masks": 2, + "n_time_masks": 2, + "p": 1.0, + "W": 80, + "adaptive_number_ratio": 0, + "adaptive_size_ratio": 0, + "max_n_time_masks": 20, + "replace_with_zero": true + }, + "prob": 1.0 } ] diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/s1/conf/augmentation.json index f26c282e..8f9ff7fd 100644 --- a/examples/tiny/s1/conf/augmentation.json +++ b/examples/tiny/s1/conf/augmentation.json @@ -27,7 +27,8 @@ "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, - "max_n_time_masks": 20 + "max_n_time_masks": 20, + "replace_with_zero": true }, "prob": 1.0 }