|
|
|
@ -1,3 +1,16 @@
|
|
|
|
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
import librosa
|
|
|
|
|
import numpy
|
|
|
|
|
import scipy
|
|
|
|
@ -5,6 +18,7 @@ import soundfile
|
|
|
|
|
|
|
|
|
|
from deepspeech.io.reader import SoundHDF5File
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpeedPerturbation():
|
|
|
|
|
"""SpeedPerturbation
|
|
|
|
|
|
|
|
|
@ -28,8 +42,7 @@ class SpeedPerturbation():
|
|
|
|
|
utt2ratio=None,
|
|
|
|
|
keep_length=True,
|
|
|
|
|
res_type="kaiser_best",
|
|
|
|
|
seed=None,
|
|
|
|
|
):
|
|
|
|
|
seed=None, ):
|
|
|
|
|
self.res_type = res_type
|
|
|
|
|
self.keep_length = keep_length
|
|
|
|
|
self.state = numpy.random.RandomState(seed)
|
|
|
|
@ -60,12 +73,10 @@ class SpeedPerturbation():
|
|
|
|
|
self.lower,
|
|
|
|
|
self.upper,
|
|
|
|
|
self.keep_length,
|
|
|
|
|
self.res_type,
|
|
|
|
|
)
|
|
|
|
|
self.res_type, )
|
|
|
|
|
else:
|
|
|
|
|
return "{}({}, res_type={})".format(
|
|
|
|
|
self.__class__.__name__, self.utt2ratio_file, self.res_type
|
|
|
|
|
)
|
|
|
|
|
self.__class__.__name__, self.utt2ratio_file, self.res_type)
|
|
|
|
|
|
|
|
|
|
def __call__(self, x, uttid=None, train=True):
|
|
|
|
|
if not train:
|
|
|
|
@ -85,15 +96,14 @@ class SpeedPerturbation():
|
|
|
|
|
diff = abs(len(x) - len(y))
|
|
|
|
|
if len(y) > len(x):
|
|
|
|
|
# Truncate noise
|
|
|
|
|
y = y[diff // 2 : -((diff + 1) // 2)]
|
|
|
|
|
y = y[diff // 2:-((diff + 1) // 2)]
|
|
|
|
|
elif len(y) < len(x):
|
|
|
|
|
# Assume the time-axis is the first: (Time, Channel)
|
|
|
|
|
pad_width = [(diff // 2, (diff + 1) // 2)] + [
|
|
|
|
|
(0, 0) for _ in range(y.ndim - 1)
|
|
|
|
|
]
|
|
|
|
|
y = numpy.pad(
|
|
|
|
|
y, pad_width=pad_width, constant_values=0, mode="constant"
|
|
|
|
|
)
|
|
|
|
|
y, pad_width=pad_width, constant_values=0, mode="constant")
|
|
|
|
|
return y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -111,7 +121,7 @@ class BandpassPerturbation():
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1,)):
|
|
|
|
|
def __init__(self, lower=0.0, upper=0.75, seed=None, axes=(-1, )):
|
|
|
|
|
self.lower = lower
|
|
|
|
|
self.upper = upper
|
|
|
|
|
self.state = numpy.random.RandomState(seed)
|
|
|
|
@ -119,18 +129,16 @@ class BandpassPerturbation():
|
|
|
|
|
self.axes = axes
|
|
|
|
|
|
|
|
|
|
def __repr__(self):
    """Return a short description of this object.

    Returns:
        str: ``"<ClassName>(lower=<lower>, upper=<upper>)"``, built from
            the runtime class name, ``self.lower`` and ``self.upper``.
    """
    # A single return statement: the scraped diff carried both the old and
    # the reformatted version of this call back to back.
    return "{}(lower={}, upper={})".format(self.__class__.__name__,
                                           self.lower, self.upper)
|
|
|
|
|
|
|
|
|
|
def __call__(self, x_stft, uttid=None, train=True):
|
|
|
|
|
if not train:
|
|
|
|
|
return x_stft
|
|
|
|
|
|
|
|
|
|
if x_stft.ndim == 1:
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
"Input in time-freq domain: " "(Time, Channel, Freq) or (Time, Freq)"
|
|
|
|
|
)
|
|
|
|
|
raise RuntimeError("Input in time-freq domain: "
|
|
|
|
|
"(Time, Channel, Freq) or (Time, Freq)")
|
|
|
|
|
|
|
|
|
|
ratio = self.state.uniform(self.lower, self.upper)
|
|
|
|
|
axes = [i if i >= 0 else x_stft.ndim - i for i in self.axes]
|
|
|
|
@ -142,7 +150,12 @@ class BandpassPerturbation():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VolumePerturbation():
|
|
|
|
|
def __init__(self, lower=-1.6, upper=1.6, utt2ratio=None, dbunit=True, seed=None):
|
|
|
|
|
def __init__(self,
|
|
|
|
|
lower=-1.6,
|
|
|
|
|
upper=1.6,
|
|
|
|
|
utt2ratio=None,
|
|
|
|
|
dbunit=True,
|
|
|
|
|
seed=None):
|
|
|
|
|
self.dbunit = dbunit
|
|
|
|
|
self.utt2ratio_file = utt2ratio
|
|
|
|
|
self.lower = lower
|
|
|
|
@ -168,12 +181,10 @@ class VolumePerturbation():
|
|
|
|
|
def __repr__(self):
    """Return a short description of this object's configuration.

    Returns:
        str: ``"<ClassName>(lower=..., upper=..., dbunit=...)"`` when
            ``self.utt2ratio`` is None; otherwise
            ``'<ClassName>("<utt2ratio_file>", dbunit=...)'``.
    """
    # Each branch keeps exactly one argument list: the scraped diff had the
    # old and the reformatted argument lines both present, which left the
    # ``.format(`` calls with unbalanced parentheses.
    if self.utt2ratio is None:
        return "{}(lower={}, upper={}, dbunit={})".format(
            self.__class__.__name__, self.lower, self.upper, self.dbunit)
    else:
        return '{}("{}", dbunit={})'.format(
            self.__class__.__name__, self.utt2ratio_file, self.dbunit)
|
|
|
|
|
|
|
|
|
|
def __call__(self, x, uttid=None, train=True):
|
|
|
|
|
if not train:
|
|
|
|
@ -186,7 +197,7 @@ class VolumePerturbation():
|
|
|
|
|
else:
|
|
|
|
|
ratio = self.state.uniform(self.lower, self.upper)
|
|
|
|
|
if self.dbunit:
|
|
|
|
|
ratio = 10 ** (ratio / 20)
|
|
|
|
|
ratio = 10**(ratio / 20)
|
|
|
|
|
return x * ratio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -201,8 +212,7 @@ class NoiseInjection():
|
|
|
|
|
utt2ratio=None,
|
|
|
|
|
filetype="list",
|
|
|
|
|
dbunit=True,
|
|
|
|
|
seed=None,
|
|
|
|
|
):
|
|
|
|
|
seed=None, ):
|
|
|
|
|
self.utt2noise_file = utt2noise
|
|
|
|
|
self.utt2ratio_file = utt2ratio
|
|
|
|
|
self.filetype = filetype
|
|
|
|
@ -242,19 +252,16 @@ class NoiseInjection():
|
|
|
|
|
|
|
|
|
|
if utt2noise is not None and utt2ratio is not None:
|
|
|
|
|
if set(self.utt2ratio) != set(self.utt2noise):
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
"The uttids mismatch between {} and {}".format(utt2ratio, utt2noise)
|
|
|
|
|
)
|
|
|
|
|
raise RuntimeError("The uttids mismatch between {} and {}".
|
|
|
|
|
format(utt2ratio, utt2noise))
|
|
|
|
|
|
|
|
|
|
def __repr__(self):
    """Return a short description of this object's configuration.

    Returns:
        str: ``"<ClassName>(lower=..., upper=..., dbunit=...)"`` when
            ``self.utt2ratio`` is None; otherwise
            ``'<ClassName>("<utt2ratio_file>", dbunit=...)'``.
    """
    # Each branch keeps exactly one argument list: the scraped diff had the
    # old and the reformatted argument lines both present, which left the
    # ``.format(`` calls with unbalanced parentheses.
    if self.utt2ratio is None:
        return "{}(lower={}, upper={}, dbunit={})".format(
            self.__class__.__name__, self.lower, self.upper, self.dbunit)
    else:
        return '{}("{}", dbunit={})'.format(
            self.__class__.__name__, self.utt2ratio_file, self.dbunit)
|
|
|
|
|
|
|
|
|
|
def __call__(self, x, uttid=None, train=True):
|
|
|
|
|
if not train:
|
|
|
|
@ -268,8 +275,8 @@ class NoiseInjection():
|
|
|
|
|
ratio = self.state.uniform(self.lower, self.upper)
|
|
|
|
|
|
|
|
|
|
if self.dbunit:
|
|
|
|
|
ratio = 10 ** (ratio / 20)
|
|
|
|
|
scale = ratio * numpy.sqrt((x ** 2).mean())
|
|
|
|
|
ratio = 10**(ratio / 20)
|
|
|
|
|
scale = ratio * numpy.sqrt((x**2).mean())
|
|
|
|
|
|
|
|
|
|
# 2. Get noise
|
|
|
|
|
if self.utt2noise is not None:
|
|
|
|
@ -280,16 +287,17 @@ class NoiseInjection():
|
|
|
|
|
# Randomly select the noise source
|
|
|
|
|
noise = self.state.choice(list(self.utt2noise.values()))
|
|
|
|
|
# Normalize the level
|
|
|
|
|
noise /= numpy.sqrt((noise ** 2).mean())
|
|
|
|
|
noise /= numpy.sqrt((noise**2).mean())
|
|
|
|
|
|
|
|
|
|
# Adjust the noise length
|
|
|
|
|
diff = abs(len(x) - len(noise))
|
|
|
|
|
offset = self.state.randint(0, diff)
|
|
|
|
|
if len(noise) > len(x):
|
|
|
|
|
# Truncate noise
|
|
|
|
|
noise = noise[offset : -(diff - offset)]
|
|
|
|
|
noise = noise[offset:-(diff - offset)]
|
|
|
|
|
else:
|
|
|
|
|
noise = numpy.pad(noise, pad_width=[offset, diff - offset], mode="wrap")
|
|
|
|
|
noise = numpy.pad(
|
|
|
|
|
noise, pad_width=[offset, diff - offset], mode="wrap")
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
# Generate white noise
|
|
|
|
@ -329,15 +337,14 @@ class RIRConvolve():
|
|
|
|
|
if x.ndim != 1:
|
|
|
|
|
# Must be single channel
|
|
|
|
|
raise RuntimeError(
|
|
|
|
|
"Input x must be one dimensional array, but got {}".format(x.shape)
|
|
|
|
|
)
|
|
|
|
|
"Input x must be one dimensional array, but got {}".format(
|
|
|
|
|
x.shape))
|
|
|
|
|
|
|
|
|
|
rir, rate = self.utt2rir[uttid]
|
|
|
|
|
if rir.ndim == 2:
|
|
|
|
|
# FIXME(kamo): Use chainer.convolution_1d?
|
|
|
|
|
# return [Time, Channel]
|
|
|
|
|
return numpy.stack(
|
|
|
|
|
[scipy.convolve(x, r, mode="same") for r in rir], axis=-1
|
|
|
|
|
)
|
|
|
|
|
[scipy.convolve(x, r, mode="same") for r in rir], axis=-1)
|
|
|
|
|
else:
|
|
|
|
|
return scipy.convolve(x, rir, mode="same")
|
|
|
|
|