parent
cd3617aeb4
commit
b07ee84a1d
@ -1,38 +1,80 @@
|
||||
"""Contains the data augmentation pipeline."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import json
|
||||
import random
|
||||
from data_utils.augmentor.volumn_perturb import VolumnPerturbAugmentor
|
||||
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
|
||||
|
||||
|
||||
class AugmentationPipeline(object):
    """Build a pre-processing pipeline with various augmentation models.

    Such a data augmentation pipeline is often leveraged to augment the
    training samples to make the model invariant to certain types of
    perturbations in the real world, improving the model's generalization
    ability.

    The pipeline is built according to the augmentation configuration in a
    json string, e.g.

    .. code-block::

        '[{"type": "volume",
           "params": {"min_gain_dBFS": -15,
                      "max_gain_dBFS": 15},
           "prob": 0.5},
          {"type": "speed",
           "params": {"min_speed_rate": 0.8,
                      "max_speed_rate": 1.2},
           "prob": 0.5}
         ]'

    This augmentation configuration inserts two augmentation models into the
    pipeline, one being VolumePerturbAugmentor and the other
    SpeedPerturbAugmentor. "prob" indicates the probability of the current
    augmentor to take effect.

    Only the augmentor types handled by ``_get_augmentor`` can be
    instantiated; any other "type" value is rejected with ValueError.

    :param augmentation_config: Augmentation configuration in json string.
    :type augmentation_config: str
    :param random_seed: Random seed.
    :type random_seed: int
    :raises ValueError: If the augmentation json config is in incorrect format.
    """

    def __init__(self, augmentation_config, random_seed=0):
        # One generator is created here and handed to every augmentor, so the
        # whole pipeline draws from a single seeded random stream.
        self._rng = random.Random(random_seed)
        self._augmentors, self._rates = self._parse_pipeline_from(
            augmentation_config)

    def transform_audio(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            # The draw lies in [0, 1): a strict comparison means "prob": 0
            # never fires while "prob": 1 still always fires.
            if self._rng.uniform(0., 1.) < rate:
                augmentor.transform_audio(audio_segment)

    def _parse_pipeline_from(self, config_json):
        """Parse the config json to build an augmentation pipeline.

        :param config_json: Augmentation configuration in json string.
        :type config_json: str
        :return: Augmentor instances and their matching probabilities, in
                 configuration order.
        :rtype: tuple
        :raises ValueError: If the json cannot be decoded, an entry lacks
                            "type", "params" or "prob", or an augmentor
                            cannot be constructed.
        """
        try:
            configs = json.loads(config_json)
            augmentors = [
                self._get_augmentor(config["type"], config["params"])
                for config in configs
            ]
            rates = [config["prob"] for config in configs]
        except Exception as e:
            # Deliberately broad: decoding errors, missing keys, malformed
            # entries and constructor failures are all reported uniformly.
            raise ValueError("Failed to parse the augmentation config json: "
                             "%s" % str(e))
        return augmentors, rates

    def _get_augmentor(self, augmentor_type, params):
        """Return an augmentation model by the type name, and pass in params.

        :param augmentor_type: Type name of the augmentor, e.g. "volume".
        :type augmentor_type: str
        :param params: Keyword arguments forwarded to the augmentor.
        :type params: dict
        :raises ValueError: If the augmentor type is unknown.
        """
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
|
||||
|
@ -0,0 +1,40 @@
|
||||
"""Contains the volume perturb augmentation model."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from data_utils.augmentor.base import AugmentorBase
|
||||
|
||||
|
||||
class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.

    This is used for multi-loudness training of PCEN. See

    https://arxiv.org/pdf/1607.05666v1.pdf

    for more details.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_gain_dBFS: Minimal gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximal gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Change audio loudness.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        # The bounds live on the instance; the bare names are not in scope
        # here and would raise NameError.
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.apply_gain(gain)
|
@ -1,17 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import random
|
||||
from data_utils.augmentor.base import AugmentorBase
|
||||
|
||||
|
||||
class VolumnPerturbAugmentor(AugmentorBase):
    """Augmentation model that applies a random gain to an audio segment.

    :param rng: Random generator object used to draw the gain.
    :type rng: random.Random
    :param min_gain_dBFS: Minimal gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximal gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Apply a gain drawn uniformly between the configured bounds.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment whose ``apply_gain`` is called.
        """
        # Read the bounds from the instance; the bare names are undefined in
        # this scope and would raise NameError.
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.apply_gain(gain)
|
@ -0,0 +1,75 @@
|
||||
"""Contains the speech segment class."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from data_utils.audio import AudioSegment
|
||||
|
||||
|
||||
class SpeechSegment(AudioSegment):
    """Speech segment abstraction, a subclass of AudioSegment,
    with an additional transcript.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate, transcript):
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript

    def __eq__(self, other):
        """Return whether two objects are equal.

        Two speech segments are equal when the AudioSegment comparison
        succeeds and their transcripts match.
        """
        # Guard first so that comparing against an object that carries no
        # transcript yields False instead of an AttributeError below.
        if not isinstance(other, SpeechSegment):
            return False
        if not AudioSegment.__eq__(self, other):
            return False
        if self._transcript != other._transcript:
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls, filepath, transcript):
        """Create speech segment from audio file and corresponding transcript.

        :param filepath: Filepath or file object to audio file.
        :type filepath: basestring|file
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        # Decode through AudioSegment, then rebuild as ``cls`` so the
        # transcript is attached.
        audio = AudioSegment.from_file(filepath)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def from_bytes(cls, bytes, transcript):
        """Create speech segment from a byte string and corresponding
        transcript.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        # Decode through AudioSegment, then rebuild as ``cls`` so the
        # transcript is attached.
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript)

    @property
    def transcript(self):
        """Return the transcript text.

        :return: Transcript text for the speech.
        :rtype: basestring
        """
        return self._transcript
|
Loading…
Reference in new issue