From 50f10f37ae8224a5d143ecfc59e31af1d992e695 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 20 Aug 2021 03:28:55 +0000
Subject: [PATCH] SpecAugment: support replacing masked regions with the mean

---
 deepspeech/__init__.py                        | 41 +------------------
 .../frontend/augmentor/impulse_response.py    |  2 +-
 .../frontend/augmentor/noise_perturb.py       |  2 +-
 .../online_bayesian_normalization.py          |  2 +-
 deepspeech/frontend/augmentor/resample.py     |  2 +-
 .../frontend/augmentor/shift_perturb.py       |  2 +-
 deepspeech/frontend/augmentor/spec_augment.py | 21 +++++++---
 .../frontend/augmentor/speed_perturb.py       |  2 +-
 .../frontend/augmentor/volume_perturb.py      |  2 +-
 examples/aishell/s0/conf/augmentation.json    |  3 +-
 examples/aishell/s1/conf/augmentation.json    |  3 +-
 examples/aug_conf/augmentation.json           | 10 -----
 .../augmentation.json}                        |  3 +-
 examples/callcenter/s1/conf/augmentation.json |  3 +-
 .../librispeech/s0/conf/augmentation.json     |  3 +-
 .../librispeech/s1/conf/augmentation.json     |  3 +-
 .../librispeech/s2/conf/augmentation.json     |  3 +-
 examples/timit/s1/conf/augmentation.json      |  3 +-
 examples/tiny/s0/conf/augmentation.json       | 25 +++++++++++
 examples/tiny/s1/conf/augmentation.json       |  3 +-
 20 files changed, 66 insertions(+), 72 deletions(-)
 delete mode 100644 examples/aug_conf/augmentation.json
 rename examples/{aug_conf/augmentation.example.json => augmentation/augmentation.json} (94%)

diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index 88f81075..fbec5a5e 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -352,45 +352,6 @@ if not hasattr(paddle.Tensor, 'tolist'):
         "register user tolist to paddle.Tensor, remove this when fixed!")
     setattr(paddle.Tensor, 'tolist', tolist)
 
-########### hcak paddle.nn.functional #############
-
-
-def glu(x: paddle.Tensor, axis=-1) -> paddle.Tensor:
-    """The gated linear unit (GLU) activation."""
-    a, b = x.split(2, axis=axis)
-    act_b = F.sigmoid(b)
-    return a * act_b
-
-
-if not hasattr(paddle.nn.functional, 'glu'):
-    logger.warn(
-        "register user glu to paddle.nn.functional, remove this when fixed!")
-    setattr(paddle.nn.functional, 'glu', glu)
-
-# def softplus(x):
-#     """Softplus function."""
-#     if hasattr(paddle.nn.functional, 'softplus'):
-#         #return paddle.nn.functional.softplus(x.float()).type_as(x)
-#         return paddle.nn.functional.softplus(x)
-#     else:
-#         raise NotImplementedError
-
-# def gelu_accurate(x):
-#     """Gaussian Error Linear Units (GELU) activation."""
-#     # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py
-#     if not hasattr(gelu_accurate, "_a"):
-#         gelu_accurate._a = math.sqrt(2 / math.pi)
-#     return 0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
-#                                       (x + 0.044715 * paddle.pow(x, 3))))
-
-# def gelu(x):
-#     """Gaussian Error Linear Units (GELU) activation."""
-#     if hasattr(nn.functional, 'gelu'):
-#         #return nn.functional.gelu(x.float()).type_as(x)
-#         return nn.functional.gelu(x)
-#     else:
-#         return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
-
 
 ########### hcak paddle.nn #############
 class GLU(nn.Layer):
@@ -401,7 +362,7 @@ class GLU(nn.Layer):
         self.dim = dim
 
     def forward(self, xs):
-        return glu(xs, dim=self.dim)
+        return F.glu(xs, axis=self.dim)  # paddle's F.glu takes `axis`, not `dim`
 
 
 if not hasattr(paddle.nn, 'GLU'):
diff --git a/deepspeech/frontend/augmentor/impulse_response.py b/deepspeech/frontend/augmentor/impulse_response.py
index b1a732ad..818251ed 100644
--- a/deepspeech/frontend/augmentor/impulse_response.py
+++ b/deepspeech/frontend/augmentor/impulse_response.py
@@ -32,7 +32,7 @@ class ImpulseResponseAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/deepspeech/frontend/augmentor/noise_perturb.py b/deepspeech/frontend/augmentor/noise_perturb.py
index 8be5931b..790b0c39 100644
--- a/deepspeech/frontend/augmentor/noise_perturb.py
+++ b/deepspeech/frontend/augmentor/noise_perturb.py
@@ -38,7 +38,7 @@ class NoisePerturbAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/deepspeech/frontend/augmentor/online_bayesian_normalization.py b/deepspeech/frontend/augmentor/online_bayesian_normalization.py
index 4b5e2301..0f9d3ef6 100644
--- a/deepspeech/frontend/augmentor/online_bayesian_normalization.py
+++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py
@@ -46,7 +46,7 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/deepspeech/frontend/augmentor/resample.py b/deepspeech/frontend/augmentor/resample.py
index a8c0c662..509fe003 100644
--- a/deepspeech/frontend/augmentor/resample.py
+++ b/deepspeech/frontend/augmentor/resample.py
@@ -33,7 +33,7 @@ class ResampleAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/deepspeech/frontend/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py
index a76fb51c..8b7439fe 100644
--- a/deepspeech/frontend/augmentor/shift_perturb.py
+++ b/deepspeech/frontend/augmentor/shift_perturb.py
@@ -33,7 +33,7 @@ class ShiftPerturbAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py
index bfa8300a..67b6cfdd 100644
--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -41,7 +41,8 @@ class SpecAugmentor(AugmentorBase):
                  W=40,
                  adaptive_number_ratio=0,
                  adaptive_size_ratio=0,
-                 max_n_time_masks=20):
+                 max_n_time_masks=20,
+                 replace_with_zero=True):
         """SpecAugment class.
         Args:
             rng (random.Random): random generator object.
@@ -54,9 +55,11 @@ class SpecAugmentor(AugmentorBase):
             adaptive_number_ratio (float): adaptive multiplicity ratio for time masking
             adaptive_size_ratio (float): adaptive size ratio for time masking
             max_n_time_masks (int): maximum number of time masking
+            replace_with_zero (bool): pad zero on mask if true else use mean
         """
         super().__init__()
         self._rng = rng
+        self.replace_with_zero = replace_with_zero
 
         self.W = W
         self.F = F
@@ -124,15 +127,18 @@ class SpecAugmentor(AugmentorBase):
         return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}"
 
-    def time_warp(xs, W=40):
-        raise NotImplementedError
+    def time_warp(self, xs, W=40):
+        return xs  # no-op placeholder: time warping is not implemented yet
 
     def mask_freq(self, xs, replace_with_zero=False):
         n_bins = xs.shape[0]
         for i in range(0, self.n_freq_masks):
             f = int(self._rng.uniform(low=0, high=self.F))
             f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
-            xs[f_0:f_0 + f, :] = 0
             assert f_0 <= f_0 + f
+            if self.replace_with_zero:
+                xs[f_0:f_0 + f, :] = 0
+            else:
+                xs[f_0:f_0 + f, :] = xs.mean()
             self._freq_mask = (f_0, f_0 + f)
         return xs
 
@@ -154,14 +160,17 @@ class SpecAugmentor(AugmentorBase):
             t = int(self._rng.uniform(low=0, high=T))
             t = min(t, int(n_frames * self.p))
             t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
-            xs[:, t_0:t_0 + t] = 0
             assert t_0 <= t_0 + t
+            if self.replace_with_zero:
+                xs[:, t_0:t_0 + t] = 0
+            else:
+                xs[:, t_0:t_0 + t] = xs.mean()
             self._time_mask = (t_0, t_0 + t)
         return xs
 
     def __call__(self, x, train=True):
         if not train:
-            return
+            return x
         return self.transform_feature(x)
 
     def transform_feature(self, xs: np.ndarray):
@@ -171,7 +180,7 @@ class SpecAugmentor(AugmentorBase):
         Returns:
             xs (FloatTensor): `[F, T]`
         """
-        # xs = self.time_warp(xs)
+        xs = self.time_warp(xs)
         xs = self.mask_freq(xs)
         xs = self.mask_time(xs)
         return xs
diff --git a/deepspeech/frontend/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py
index eec2e551..ce8dfde0 100644
--- a/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/deepspeech/frontend/augmentor/speed_perturb.py
@@ -81,7 +81,7 @@ class SpeedPerturbAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/deepspeech/frontend/augmentor/volume_perturb.py b/deepspeech/frontend/augmentor/volume_perturb.py
index d08f75c3..70cb2889 100644
--- a/deepspeech/frontend/augmentor/volume_perturb.py
+++ b/deepspeech/frontend/augmentor/volume_perturb.py
@@ -39,7 +39,7 @@ class VolumePerturbAugmentor(AugmentorBase):
 
     def __call__(self, x, uttid=None, train=True):
         if not train:
-            return
+            return x
         self.transform_audio(x)
         return x
 
diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/s0/conf/augmentation.json
index 1987ad42..81d110b0 100644
--- a/examples/aishell/s0/conf/augmentation.json
+++ b/examples/aishell/s0/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/s1/conf/augmentation.json
index 1987ad42..81d110b0 100644
--- a/examples/aishell/s1/conf/augmentation.json
+++ b/examples/aishell/s1/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/aug_conf/augmentation.json b/examples/aug_conf/augmentation.json
deleted file mode 100644
index a1a759e6..00000000
--- a/examples/aug_conf/augmentation.json
+++ /dev/null
@@ -1,10 +0,0 @@
-[
-  {
-    "type": "shift",
-    "params": {
-      "min_shift_ms": -5,
-      "max_shift_ms": 5
-    },
-    "prob": 1.0
-  }
-]
diff --git a/examples/aug_conf/augmentation.example.json b/examples/augmentation/augmentation.json
similarity index 94%
rename from examples/aug_conf/augmentation.example.json
rename to examples/augmentation/augmentation.json
index efae2e5e..baf2cac3 100644
--- a/examples/aug_conf/augmentation.example.json
+++ b/examples/augmentation/augmentation.json
@@ -60,7 +60,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 0.0
   }
diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/s1/conf/augmentation.json
index 1987ad42..81d110b0 100644
--- a/examples/callcenter/s1/conf/augmentation.json
+++ b/examples/callcenter/s1/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/s0/conf/augmentation.json
index 1987ad42..81d110b0 100644
--- a/examples/librispeech/s0/conf/augmentation.json
+++ b/examples/librispeech/s0/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/s1/conf/augmentation.json
index c1078393..7dd158eb 100644
--- a/examples/librispeech/s1/conf/augmentation.json
+++ b/examples/librispeech/s1/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/s2/conf/augmentation.json
index 49fe333e..cc8c7e00 100644
--- a/examples/librispeech/s2/conf/augmentation.json
+++ b/examples/librispeech/s2/conf/augmentation.json
@@ -10,7 +10,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/s1/conf/augmentation.json
index c1078393..7dd158eb 100644
--- a/examples/timit/s1/conf/augmentation.json
+++ b/examples/timit/s1/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/s0/conf/augmentation.json
index a1a759e6..8f9ff7fd 100644
--- a/examples/tiny/s0/conf/augmentation.json
+++ b/examples/tiny/s0/conf/augmentation.json
@@ -1,4 +1,13 @@
 [
+  {
+    "type": "speed",
+    "params": {
+      "min_speed_rate": 0.9,
+      "max_speed_rate": 1.1,
+      "num_rates": 3
+    },
+    "prob": 1.0
+  },
   {
     "type": "shift",
     "params": {
@@ -6,5 +15,21 @@
       "max_shift_ms": 5
     },
     "prob": 1.0
+  },
+  {
+    "type": "specaug",
+    "params": {
+      "F": 10,
+      "T": 50,
+      "n_freq_masks": 2,
+      "n_time_masks": 2,
+      "p": 1.0,
+      "W": 80,
+      "adaptive_number_ratio": 0,
+      "adaptive_size_ratio": 0,
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
+    },
+    "prob": 1.0
   }
 ]
diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/s1/conf/augmentation.json
index f26c282e..8f9ff7fd 100644
--- a/examples/tiny/s1/conf/augmentation.json
+++ b/examples/tiny/s1/conf/augmentation.json
@@ -27,7 +27,8 @@
       "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
-      "max_n_time_masks": 20
+      "max_n_time_masks": 20,
+      "replace_with_zero": true
     },
     "prob": 1.0
   }