parent
cd3617aeb4
commit
b07ee84a1d
@ -1,38 +1,80 @@
|
|||||||
|
"""Contains the data augmentation pipeline."""
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
from data_utils.augmentor.volumn_perturb import VolumnPerturbAugmentor
|
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
|
||||||
|
|
||||||
|
|
||||||
class AugmentationPipeline(object):
    """Build a pre-processing pipeline with various augmentation models.

    Such a data augmentation pipeline is often leveraged to augment the
    training samples to make the model invariant to certain types of
    perturbations in the real world, improving the model's generalization
    ability.

    The pipeline is built according to the augmentation configuration in a
    json string, e.g.

    .. code-block::

        '[{"type": "volume",
           "params": {"min_gain_dBFS": -15,
                      "max_gain_dBFS": 15},
           "prob": 0.5},
          {"type": "speed",
           "params": {"min_speed_rate": 0.8,
                      "max_speed_rate": 1.2},
           "prob": 0.5}
         ]'

    This augmentation configuration inserts two augmentation models into the
    pipeline, one being VolumePerturbAugmentor and the other
    SpeedPerturbAugmentor. "prob" indicates the probability of the current
    augmentor to take effect and must be a number within [0, 1].

    Note that only the type names handled by ``_get_augmentor`` can actually
    be instantiated; any other type name is rejected with a ValueError.

    :param augmentation_config: Augmentation configuration in json string.
    :type augmentation_config: str
    :param random_seed: Random seed.
    :type random_seed: int
    :raises ValueError: If the augmentation json config is in incorrect format.
    """

    def __init__(self, augmentation_config, random_seed=0):
        # One seeded generator is shared by the pipeline and its augmentors.
        self._rng = random.Random(random_seed)
        self._augmentors, self._rates = self._parse_pipeline_from(
            augmentation_config)

    def transform_audio(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            # uniform(0., 1.) draws from [0, 1), so a strict comparison makes
            # a rate of 0 never fire and a rate of 1 always fire.
            if self._rng.uniform(0., 1.) < rate:
                augmentor.transform_audio(audio_segment)

    def _parse_pipeline_from(self, config_json):
        """Parse the config json to build an augmentation pipeline.

        :param config_json: Augmentation configuration in json string.
        :type config_json: str
        :return: Tuple of (list of augmentors, list of their probabilities).
        :rtype: tuple
        :raises ValueError: If the json cannot be decoded, an entry lacks
                            "type", "params" or "prob", "prob" is invalid, or
                            an augmentor cannot be constructed.
        """
        try:
            configs = json.loads(config_json)
            # Probabilities are checked first so that a bad "prob" is
            # reported before any augmentor gets constructed.
            rates = [self._check_prob(config["prob"]) for config in configs]
            augmentors = [
                self._get_augmentor(config["type"], config["params"])
                for config in configs
            ]
        except Exception as e:
            # Every failure is reported uniformly as a ValueError, which is
            # the only exception type documented for the constructor.
            raise ValueError("Failed to parse the augmentation config json: "
                             "%s" % str(e))
        return augmentors, rates

    @staticmethod
    def _check_prob(prob):
        """Return `prob` unchanged if it is a number within [0, 1].

        :param prob: Value of the "prob" field of one augmentor config.
        :return: The validated probability.
        :raises ValueError: If `prob` is not a number or is out of range.
        """
        if isinstance(prob, bool) or not isinstance(prob, (int, float)):
            raise ValueError("\"prob\" must be a number, got [%r]." % (prob, ))
        if not 0 <= prob <= 1:
            raise ValueError("\"prob\" must be within [0, 1], got [%r]." %
                             (prob, ))
        return prob

    def _get_augmentor(self, augmentor_type, params):
        """Return an augmentation model by the type name, and pass in params.

        :param augmentor_type: Type name of the augmentor, e.g. "volume".
        :type augmentor_type: basestring
        :param params: Keyword arguments for the augmentor's constructor.
        :type params: dict
        :raises ValueError: If the type name is unknown.
        """
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
|
||||||
|
@ -0,0 +1,40 @@
|
|||||||
|
"""Contains the volume perturb augmentation model."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from data_utils.augmentor.base import AugmentorBase
|
||||||
|
|
||||||
|
|
||||||
|
class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.

    This is used for multi-loudness training of PCEN. See

    https://arxiv.org/pdf/1607.05666v1.pdf

    for more details.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_gain_dBFS: Minimal gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximal gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Change audio loudness.

        A gain is drawn uniformly between the configured minimal and maximal
        gain and handed to the segment's ``apply_gain``.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        # The bounds live on the instance; the bare constructor argument
        # names are not in scope here and would raise a NameError.
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.apply_gain(gain)
|
@ -1,17 +0,0 @@
|
|||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import random
|
|
||||||
from data_utils.augmentor.base import AugmentorBase
|
|
||||||
|
|
||||||
|
|
||||||
class VolumnPerturbAugmentor(AugmentorBase):
    """Augmentation model applying a randomly drawn gain to an audio segment.

    :param rng: Random generator object; its ``uniform`` method is used to
                draw the gain.
    :type rng: random.Random
    :param min_gain_dBFS: Lower bound of the drawn gain.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Upper bound of the drawn gain.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Draw a gain between the configured bounds and apply it.

        The drawn value is passed to ``audio_segment.apply_gain``.

        :param audio_segment: Audio segment to add effects to.
        """
        # Read the bounds from the instance; the bare constructor argument
        # names are not defined in this scope and would raise a NameError.
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.apply_gain(gain)
|
|
@ -0,0 +1,75 @@
|
|||||||
|
"""Contains the speech segment class."""
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import division
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
|
from data_utils.audio import AudioSegment
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechSegment(AudioSegment):
    """An AudioSegment that additionally carries the transcript of the speech.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :raises TypeError: If the sample data type is not float or int
                       (presumably raised by AudioSegment.__init__; this
                       class adds no check of its own).
    """

    def __init__(self, samples, sample_rate, transcript):
        # The audio part is initialized by the base class; only the
        # transcript is stored here.
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript

    def __eq__(self, other):
        """Return whether two objects are equal.

        Two speech segments are equal when AudioSegment considers them equal
        and their transcripts do not differ.
        """
        if AudioSegment.__eq__(self, other):
            return not (self._transcript != other._transcript)
        return False

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        is_equal = self.__eq__(other)
        return not is_equal

    @classmethod
    def from_file(cls, filepath, transcript):
        """Create speech segment from audio file and corresponding transcript.

        :param filepath: Filepath or file object to audio file.
        :type filepath: basestring|file
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance (an instance of `cls`).
        :rtype: SpeechSegment
        """
        # Loading is delegated to AudioSegment; its samples and sample rate
        # are then re-wrapped together with the transcript.
        loaded = AudioSegment.from_file(filepath)
        return cls(loaded.samples, loaded.sample_rate, transcript)

    @classmethod
    def from_bytes(cls, bytes, transcript):
        """Create speech segment from a byte string and corresponding
        transcript.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance (an instance of `cls`).
        :rtype: SpeechSegment
        """
        # Decoding is delegated to AudioSegment; its samples and sample rate
        # are then re-wrapped together with the transcript.
        decoded = AudioSegment.from_bytes(bytes)
        return cls(decoded.samples, decoded.sample_rate, transcript)

    @property
    def transcript(self):
        """Return the transcript text.

        :return: Transcript text for the speech.
        :rtype: basestring
        """
        return self._transcript
|
Loading…
Reference in new issue