From 65e34c535b4444c42c28f14b16a2617a73d296d1 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Thu, 15 Jun 2017 03:08:30 +0800 Subject: [PATCH 01/11] add augmentation --- data_utils/audio.py | 396 ++++++++++++++++- data_utils/augmentor/audio_database.py | 401 ++++++++++++++++++ data_utils/augmentor/augmentation.py | 15 + data_utils/augmentor/implus_response.py | 76 ++++ data_utils/augmentor/noise_speech.py | 318 ++++++++++++++ .../online_bayesian_normalization.py | 57 +++ data_utils/augmentor/resampler.py | 30 ++ data_utils/augmentor/speed_perturb.py | 53 +++ data_utils/augmentor/volume_perturb.py | 4 +- 9 files changed, 1337 insertions(+), 13 deletions(-) create mode 100755 data_utils/augmentor/audio_database.py create mode 100755 data_utils/augmentor/implus_response.py create mode 100755 data_utils/augmentor/noise_speech.py create mode 100755 data_utils/augmentor/online_bayesian_normalization.py create mode 100755 data_utils/augmentor/resampler.py create mode 100755 data_utils/augmentor/speed_perturb.py diff --git a/data_utils/audio.py b/data_utils/audio.py index 916c8ac1..aef13c30 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -6,6 +6,8 @@ from __future__ import print_function import numpy as np import io import soundfile +import scikits.samplerate +from scipy import signal class AudioSegment(object): @@ -62,6 +64,69 @@ class AudioSegment(object): samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) + @classmethod + def slice_from_file(cls, fname, start=None, end=None): + """ + Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param fname: input audio file name + :type fname: bsaestring + :param start: start time in seconds (supported granularity is ms) + If start is negative, it wraps around from the end. If not + provided, this function reads from the very beginning. + :type start: float + :param end: start time in seconds (supported granularity is ms) + If end is negative, it wraps around from the end. If not + provided, the default behvaior is to read to the end of the + file. + :type end: float + + :return:the specified slice of input audio in the audio.AudioSegment + format. + """ + sndfile = soundfile.SoundFile(fname) + + sample_rate = sndfile.samplerate + if sndfile.channels != 1: + raise TypeError("{} has more than 1 channel.".format(fname)) + + duration = float(len(sndfile)) / sample_rate + + if start is None: + start = 0.0 + if end is None: + end = duration + + if start < 0.0: + start += duration + if end < 0.0: + end += duration + + if start < 0.0: + raise IndexError("The slice start position ({} s) is out of " + "bounds. Filename: {}".format(start, fname)) + if end < 0.0: + raise IndexError("The slice end position ({} s) is out of bounds " + "Filename: {}".format(end, fname)) + + if start > end: + raise IndexError("The slice start position ({} s) is later than " + "the slice end position ({} s)." + .format(start, end)) + + if end > duration: + raise ValueError("The slice end time ({} s) is out of " + "bounds (> {} s) Filename: {}" + .format(end, duration, fname)) + + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + + return cls(data, sample_rate) + @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. 
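The slice_from_file helper above is essentially a thin wrapper over soundfile's seek/read interface, so only the requested frames are ever decoded. A minimal standalone sketch of the same idea, assuming a mono file exists at the placeholder path (the 3.0-4.5 s window is also a placeholder); negative times wrap from the end as in the method above:

    import soundfile

    def read_slice(fname, start_sec, end_sec):
        """Read only [start_sec, end_sec) of a mono file without loading it all."""
        with soundfile.SoundFile(fname) as sndfile:
            sample_rate = sndfile.samplerate
            duration = float(len(sndfile)) / sample_rate
            start = start_sec + duration if start_sec < 0 else start_sec
            end = end_sec + duration if end_sec < 0 else end_sec
            sndfile.seek(int(start * sample_rate))     # jump straight to the slice
            frames = int(end * sample_rate) - int(start * sample_rate)
            return sndfile.read(frames=frames, dtype='float32'), sample_rate

    samples, rate = read_slice('example.wav', 3.0, 4.5)   # placeholder path/times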
@@ -75,6 +140,44 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent audio segment of the given duration and + sample rate. + + :param duration: length of silence in seconds + :type duration: scalar + :param sample_rate: sample rate + :type sample_rate: scalar + :returns: silence of the given duration + :rtype: AudioSegment + """ + samples = np.zeros(int(float(duration) * sample_rate)) + return cls(samples, sample_rate) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: input audio segments + :type *segments: [AudioSegment] + """ + # Perform basic sanity-checks. + N = len(segments) + if N == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for segment in segments: + if sample_rate != segment._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(segment) is not cls: + raise TypeError("Only audio segments of the same type " + "instance can be concatenated.") + + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate) + def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. @@ -143,23 +246,288 @@ class AudioSegment(object): new_indices = np.linspace(start=0, stop=old_length, num=new_length) self._samples = np.interp(new_indices, old_indices, self._samples) - def normalize(self, target_sample_rate): - raise NotImplementedError() + def normalize(self, target_db=-20, max_gain_db=300.0): + """Normalize audio to desired RMS value in decibels. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels.This value + should be less than 0.0 as 0.0 is full-scale audio. + :type target_db: float, optional + :param max_gain_db: Max amount of gain in dB that can be applied + for normalization. This is to prevent nans when attempting + to normalize a signal consisting of all zeros. + :type max_gain_db: float, optional - def resample(self, target_sample_rate): - raise NotImplementedError() + :raises NormalizationWarning: if the required gain to normalize the + segment to the target_db value exceeds max_gain_db. + """ + gain = target_db - self.rms_db + if gain > max_gain_db: + raise ValueError( + "Unable to normalize segment to {} dB because it has an RMS " + "value of {} dB and the difference exceeds max_gain_db ({} dB)" + .format(target_db, self.rms_db, max_gain_db)) + gain = min(max_gain_db, target_db - self.rms_db) + self.apply_gain(gain) + + def normalize_online_bayesian(self, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + """ + Normalize audio using a production-compatible online/causal algorithm. + This uses an exponential likelihood and gamma prior to make + online estimates of the RMS even when there are very few samples. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels + :type target_bd: scalar + :param prior_db: Prior RMS estimate in decibels + :type prior_db: scalar + :param prior_samples: Prior strength in number of samples + :type prior_samples: scalar + :param startup_delay: Default: 0.0 s. If provided, this + function will accrue statistics for the first startup_delay + seconds before applying online normalization. 
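normalize() above reduces to a single gain, target_db minus the segment's current RMS level in dB, refused when it exceeds max_gain_db. The same arithmetic on a bare NumPy array (illustrative only, not the class itself):

    import numpy as np

    x = 0.05 * np.random.randn(16000)              # made-up quiet signal
    target_db, max_gain_db = -20.0, 300.0
    rms_db = 10.0 * np.log10(np.mean(x ** 2))      # current RMS level in dB
    gain_db = target_db - rms_db
    if gain_db > max_gain_db:
        raise ValueError("required gain exceeds max_gain_db")
    y = x * 10.0 ** (gain_db / 20.0)               # amplitude gain for a dB change
    print(10.0 * np.log10(np.mean(y ** 2)))        # ~ -20.0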
+ :type startup_delay: scalar + """ + # Estimate total RMS online + startup_sample_idx = min(self.num_samples - 1, + int(self.sample_rate * startup_delay)) + prior_mean_squared = 10.**(prior_db / 10.) + prior_sum_of_squares = prior_mean_squared * prior_samples + cumsum_of_squares = np.cumsum(self.samples**2) + sample_count = np.arange(len(self)) + 1 + if startup_sample_idx > 0: + cumsum_of_squares[:startup_sample_idx] = \ + cumsum_of_squares[startup_sample_idx] + sample_count[:startup_sample_idx] = \ + sample_count[startup_sample_idx] + mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / + (sample_count + prior_samples)) + rms_estimate_db = 10 * np.log10(mean_squared_estimate) + + # Compute required time-varying gain + gain_db = target_db - rms_estimate_db + + # Apply gain to new segment + self.apply_gain(gain_db) + + def normalize_ewma(self, + target_db, + decay_rate, + startup_delay, + rms_eps=1e-6, + max_gain_db=300.0): + startup_sample_idx = min(self.num_samples - 1, + int(self.sample_rate * startup_delay)) + mean_sq = self.samples**2 + if startup_sample_idx > 0: + mean_sq[:startup_sample_idx] = \ + np.sum(mean_sq[:startup_sample_idx]) / startup_sample_idx + idx_start = max(0, startup_sample_idx - 1) + initial_condition = mean_sq[idx_start] * decay_rate + mean_sq[idx_start:] = lfilter( + [1.0 - decay_rate], [1.0, -decay_rate], + mean_sq[idx_start:], + axis=0, + zi=[initial_condition])[0] + rms_estimate_db = 10.0 * np.log10(mean_sq + rms_eps) + gain_db = target_db - rms_estimate_db + if np.any(gain_db > max_gain_db): + warnings.warn( + "Unable to normalize segment to {} dB because it has an RMS " + "value of {} dB and the difference exceeds max_gain_db ({} dB)" + .format(target_db, self.rms_db, max_gain_db), + NormalizationWarning) + gain_db = np.minimum(gain_db, max_gain_db) + self.apply_gain(gain_db) + + def resample(self, target_sample_rate, quality='sinc_medium'): + """Resample audio and return new AudioSegment. + This resamples the audio to a new sample rate and returns a brand + new AudioSegment. The existing AudioSegment is unchanged. + + Note that this is an in-place transformation. + + :param new_sample_rate: target sample rate + :type new_sample_rate: scalar + :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. + Sets resampling speed/quality tradeoff. + See http://www.mega-nerd.com/SRC/api_misc.html#Converters + :type quality: basestring + """ + resample_ratio = target_sample_rate / self._sample_rate + new_samples = scikits.samplerate.resample( + self._samples, r=resample_ratio, type=quality) + self._samples = new_samples + self._sample_rate = new_sample_rate def pad_silence(self, duration, sides='both'): - raise NotImplementedError() + """Pads this audio sample with a period of silence. + + Note that this is an in-place transformation. + + :param duration: length of silence in seconds to pad + :type duration: float + :param sides: + 'beginning' - adds silence in the beginning + 'end' - adds silence in the end + 'both' - adds silence in both the beginning and the end. 
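The online Bayesian variant replaces that single global RMS with a running estimate: the prior acts like prior_samples pseudo-observations with mean square 10**(prior_db / 10), and the posterior mean square at sample t is the prior-weighted running average of the squared signal, exactly as in the cumulative sums above. A self-contained sketch of just the estimator (signal and parameter values are arbitrary):

    import numpy as np

    x = 0.1 * np.random.randn(8000)
    target_db, prior_db, prior_samples = -20.0, -25.0, 1000

    prior_ms = 10.0 ** (prior_db / 10.0)                 # prior mean square
    cum_sq = np.cumsum(x ** 2)                           # running sum of squares
    count = np.arange(1, len(x) + 1)                     # samples seen so far
    ms_est = (cum_sq + prior_ms * prior_samples) / (count + prior_samples)
    rms_est_db = 10.0 * np.log10(ms_est)                 # per-sample RMS estimate
    gain_db = target_db - rms_est_db                     # time-varying gain
    y = x * 10.0 ** (gain_db / 20.0)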
+ :type sides: basestring + """ + if duration == 0.0: + return self + cls = type(self) + silence = cls.make_silence(duration, self._sample_rate) + if sides == "beginning": + padded = cls.concatenate(silence, self) + elif sides == "end": + padded = cls.concatenate(self, silence) + elif sides == "both": + padded = cls.concatenate(silence, self, silence) + else: + raise ValueError("Unknown value for the kwarg 'sides'") + self._samples = padded._samples + self._sample_rate = padded._sample_rate def subsegment(self, start_sec=None, end_sec=None): - raise NotImplementedError() + """Return new AudioSegment containing audio between given boundaries. + + :param start_sec: Beginning of subsegment in seconds, + (beginning of segment if None). + :type start_sec: scalar + :param end_sec: End of subsegment in seconds, + (end of segment if None). + :type end_sec: scalar + + :return: New AudioSegment containing specified + subsegment. + :trype: AudioSegment + """ + # Default boundaries + if start_sec is None: + start_sec = 0.0 + if end_sec is None: + end_sec = self.duration + + # negative boundaries are relative to end of segment + if start_sec < 0.0: + start_sec = self.duration + start_sec + if end_sec < 0.0: + end_sec = self.duration + end_sec - def convolve(self, filter, allow_resample=False): - raise NotImplementedError() + start_sample = int(round(start_sec * self._sample_rate)) + end_sample = int(round(end_sec * self._sample_rate)) + samples = self._samples[start_sample:end_sample] - def convolve_and_normalize(self, filter, allow_resample=False): - raise NotImplementedError() + return type(self)(samples, sample_rate=self._sample_rate) + + def random_subsegment(self, subsegment_length, rng=None): + """ + Return a random subsegment of a specified length in seconds. + + :param subsegment_length: Subsegment length in seconds. + :type subsegment_length: scalar + :param rng: Random number generator state + :type rng: random.Random [optional] + + + :return:clip (SpeechDLSegment): New SpeechDLSegmen containing random + subsegment of original segment. + """ + if rng is None: + rng = random.Random() + + if subsegment_length > self.duration: + raise ValueError("Length of subsegment must not be greater " + "than original segment.") + start_time = rng.uniform(0.0, self.duration - subsegment_length) + return self.subsegment(start_time, start_time + subsegment_length) + + def convolve(self, ir, allow_resampling=False): + """Convolve this audio segment with the given filter. + + :param ir: impulse response + :type ir: AudioSegment + :param allow_resampling: indicates whether resampling is allowed + when the ir has a different sample rate from this signal. + :type allow_resampling: boolean + """ + if allow_resampling and self.sample_rate != ir.sample_rate: + ir = ir.resample(self.sample_rate) + + if self.sample_rate != ir.sample_rate: + raise ValueError("Impulse response sample rate ({}Hz) is " + "equal to base signal sample rate ({}Hz)." + .format(ir.sample_rate, self.sample_rate)) + + samples = signal.fftconvolve(self.samples, ir.samples, "full") + self._samples = samples + + def convolve_and_normalize(self, ir, allow_resample=False): + """Convolve and normalize the resulting audio segment so that it + has the same average power as the input signal. + + :param ir: impulse response + :type ir: AudioSegment + :param allow_resampling: indicates whether resampling is allowed + when the ir has a different sample rate from this signal. 
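convolve() above is a straight scipy.signal.fftconvolve of the segment with the impulse response, and convolve_and_normalize() (continued below) then restores the original average power. The same effect on raw arrays, with a synthetic decaying noise burst standing in for a measured impulse response:

    import numpy as np
    from scipy import signal

    x = 0.1 * np.random.randn(16000)                                 # dry signal
    ir = np.exp(-np.linspace(0.0, 8.0, 800)) * np.random.randn(800)  # toy impulse response

    orig_rms_db = 10.0 * np.log10(np.mean(x ** 2))
    wet = signal.fftconvolve(x, ir, "full")                          # reverberated signal
    wet_rms_db = 10.0 * np.log10(np.mean(wet ** 2))
    wet *= 10.0 ** ((orig_rms_db - wet_rms_db) / 20.0)               # restore original RMS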
+ :type allow_resampling: boolean + """ + self.convolve(ir, allow_resampling=allow_resampling) + self.normalize(target_db=self.rms_db) + + def add_noise(self, + noise, + snr_dB, + allow_downsampling=False, + max_gain_db=300.0, + rng=None): + """Adds the given noise segment at a specific signal-to-noise ratio. + If the noise segment is longer than this segment, a random subsegment + of matching length is sampled from it and used instead. + + :param noise: Noise signal to add. + :type noise: SpeechDLSegment + :param snr_dB: Signal-to-Noise Ratio, in decibels. + :type snr_dB: scalar + :param allow_downsampling: whether to allow the noise signal + to be downsampled to match the base signal sample rate. + :type allow_downsampling: boolean + :param max_gain_db: Maximum amount of gain to apply to noise + signal before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. + :type max_gain_db: scalar + :param rng: Random number generator state. + :type rng: random.Random + + Returns: + SpeechDLSegment: signal with noise added. + """ + if rng is None: + rng = random.Random() + + if allow_downsampling and noise.sample_rate > self.sample_rate: + noise = noise.resample(self.sample_rate) + + if noise.sample_rate != self.sample_rate: + raise ValueError("Noise sample rate ({}Hz) is not equal to " + "base signal sample rate ({}Hz)." + .format(noise.sample_rate, self.sample_rate)) + if noise.duration < self.duration: + raise ValueError("Noise signal ({} sec) must be at " + "least as long as base signal ({} sec)." + .format(noise.duration, self.duration)) + noise_gain_db = self.rms_db - noise.rms_db - snr_dB + noise_gain_db = min(max_gain_db, noise_gain_db) + noise_subsegment = noise.random_subsegment(self.duration, rng=rng) + output = self + self.tranform_noise(noise_subsegment, noise_gain_db) + self._samples = output._samples + self._sample_rate = output._sample_rate @property def samples(self): @@ -186,7 +554,7 @@ class AudioSegment(object): :return: Number of samples. :rtype: int """ - return self._samples.shape(0) + return self._samples.shape[0] @property def duration(self): @@ -250,3 +618,9 @@ class AudioSegment(object): else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return output_samples.astype(dtype) + + def tranform_noise(self, noise_subsegment, noise_gain_db): + """ tranform noise file + """ + return type(self)(noise_subsegment._samples * (10.**( + noise_gain_db / 20.)), noise_subsegment._sample_rate) diff --git a/data_utils/augmentor/audio_database.py b/data_utils/augmentor/audio_database.py new file mode 100755 index 00000000..e41c6dd7 --- /dev/null +++ b/data_utils/augmentor/audio_database.py @@ -0,0 +1,401 @@ +from __future__ import print_function +from collections import defaultdict +import bisect +import logging +import numpy as np +import os +import random +import sys + +UNK_TAG = "" + + +def stream_audio_index(fname, UNK=UNK_TAG): + """Reads an audio index file and emits one record in the index at a time. + + :param fname: audio index path + :type fname: basestring + :param UNK: UNK token to denote that certain audios are not tagged. + :type UNK: basesring + + Yields: + idx, duration, size, relpath, tags (int, float, int, str, list(str)): + audio file id, length of the audio in seconds, size in byte, + relative path w.r.t. 
to the root noise directory, list of tags + """ + with open(fname) as audio_index_file: + for i, line in enumerate(audio_index_file): + tok = line.strip().split("\t") + assert len(tok) >= 4, \ + "Invalid line at line {} in file {}".format( + i + 1, audio_index_file) + idx = int(tok[0]) + duration = float(tok[1]) + # Sometimes, the duration can round down to 0.0 + assert duration >= 0.0, \ + "Invalid duration at line {} in file {}".format( + i + 1, audio_index_file) + size = int(tok[2]) + assert size > 0, \ + "Invalid size at line {} in file {}".format( + i + 1, audio_index_file) + relpath = tok[3] + if len(tok) == 4: + tags = [UNK_TAG] + else: + tags = tok[4:] + yield idx, duration, size, relpath, tags + + +def truncate_float(val, ndigits=6): + """ Truncates a floating-point value to have the desired number of + digits after the decimal point. + + :param val: input value. + :type val: float + :parma ndigits: desired number of digits. + :type ndigits: int + + :return: truncated value + :rtype: float + """ + p = 10.0**ndigits + return float(int(val * p)) / p + + +def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout): + """Prints an audio record to the index file. + + :param idx: Audio file id. + :type idx: int + :param duration: length of the audio in seconds + :type duration: float + :param size: size of the file in bytes + :type size: int + :param relpath: relative path w.r.t. to the root noise directory. + :type relpath: basestring + :parma tags: list of tags + :parma tags: list(str) + :parma file: file to which we want to write an audio record. + :type file: sys.stdout + """ + file.write("{}\t{:.6f}\t{}\t{}" + .format(idx, truncate_float(duration, ndigits=6), size, relpath)) + for tag in tags: + file.write("\t{}".format(tag)) + file.write("\n") + + +class AudioIndex(object): + """ In-memory index of audio files that do not have annotations. + This supports duration-based sampling and sampling from a target + distribution. + + Each line in the index file consists of the following fields: + (id (int), duration (float), size (int), relative path (str), + list of tags ([str])) + """ + + def __init__(self): + self.audio_dir = None + self.index_fname = None + self.tags = None + self.bin_size = 2.0 + self.clear() + + def clear(self): + """ Clears the index + + Returns: + None + """ + self.idx_to_record = {} + # The list of indices correspond to audio files whose duration is + # greater than or equal to the key. + self.duration_to_id_set = {} + self.duration_to_id_set_per_tag = defaultdict(lambda: {}) + self.duration_to_list = defaultdict(lambda: []) + self.duration_to_list_per_tag = defaultdict( + lambda: defaultdict(lambda: [])) + self.tag_to_id_set = defaultdict(lambda: set()) + self.shared_duration_bins = [] + self.id_set_complete = set() + self.id_set = set() + self.duration_bins = [] + + def has_audio(self, distr=None): + """ + :param distr: The target distribution of audio tags that we want to + match. If this is not supplied, the function simply checks that + there are some audio files. + :parma distr: dict + :return: True if there are audio files. + :rtype: boolean + """ + if distr is None: + return len(self.id_set) > 0 + else: + for tag in distr: + if tag not in self.duration_to_list_per_tag: + return False + return True + + def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size): + """Loads all audio records from the disk into memory and groups them + into chunks based on their duration and the bin_size granalarity. 
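Stepping back briefly to add_noise above: the gain applied to the noise follows directly from the SNR definition. After scaling, signal_RMS_dB minus noise_RMS_dB should equal snr_dB, so the gain is self.rms_db - noise.rms_db - snr_dB, capped at max_gain_db. Worked through on plain arrays:

    import numpy as np

    def rms_db(x):
        return 10.0 * np.log10(np.mean(x ** 2))

    speech = 0.10 * np.random.randn(16000)
    noise = 0.05 * np.random.randn(16000)
    snr_db, max_gain_db = 10.0, 300.0

    noise_gain_db = min(max_gain_db, rms_db(speech) - rms_db(noise) - snr_db)
    scaled_noise = noise * 10.0 ** (noise_gain_db / 20.0)
    mixed = speech + scaled_noise
    print(rms_db(speech) - rms_db(scaled_noise))     # ~ 10.0, the requested SNR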
+ + Once all the records are read, indices are built from these records + by another function so that the audio samples can be drawn efficiently. + + Updates: + self.audio_dir (path): audio root directory + self.idx_fname (path): audio database index filename + self.bin_size (float): granularity of bins + self.idx_to_record (dict): maps from the audio id to + (duration, file_size, relative_path, tags) + self.tag_to_id_set (dict): maps from the tag to + the set of id's of audios that have this tag. + self.id_set_complete (set): set of all audio id's in the index file + self.min_duration (float): minimum audio duration observed in the + index file + self.duration_bins (list): the lower bounds on the duration of + audio files falling in each bin + self.duration_to_id_set (dict): contains (k, v) where v is the set + of id's of audios whose lengths are longer than or equal to k. + (e.g. k is the duration lower bound of this bin). + self.duration_to_id_set_per_tag (dict): Something like above but + has a finer granularity mapping from the tag to + duration_to_id_set. + self.shared_duration_bins (list): list of sets where each set + contains duration lower bounds whose audio id sets are the + same. The rationale for having this is that there are a few + but extremely long audio files which lead to a lot of bins. + When the id sets do not change across various minimum duration + boundaries, we + cluster these together and make them point to the same id set + reference. + + :return: whether the records were read from the disk. The assumption is + that the audio index file on disk and the actual audio files + are constructed once and never change during training. We only + re-read when either the directory or the index file path change. + """ + if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \ + self.bin_size == bin_size: + # The audio directory and/or the list of audio files + # haven't changed. No need to load the list again. + return False + + # Remember where the audio index is most recently read from. + self.audio_dir = audio_dir + self.idx_fname = idx_fname + self.bin_size = bin_size + + # Read in the idx and compute the number of bins necessary + self.clear() + rank = [] + min_duration = float('inf') + max_duration = float('-inf') + for idx, duration, file_size, relpath, tags in \ + stream_audio_index(idx_fname): + self.idx_to_record[idx] = (duration, file_size, relpath, tags) + max_duration = max(max_duration, duration) + min_duration = min(min_duration, duration) + rank.append((duration, idx)) + for tag in tags: + self.tag_to_id_set[tag].add(idx) + if len(rank) == 0: + # file is empty + raise IOError("Index file {} is empty".format(idx_fname)) + for tag in self.tag_to_id_set: + self.id_set_complete |= self.tag_to_id_set[tag] + dur = min_duration + self.min_duration = min_duration + while dur < max_duration + bin_size: + self.duration_bins.append(dur) + dur += bin_size + + # Sort in decreasing order of duration and populate + # the cumulative indices lists. + rank.sort(reverse=True) + + # These are indices for `rank` and used to keep track of whether + # there are new records to add in the current bin. + last = 0 + cur = 0 + + # The set of audios falling in the previous bin; in the case, + # where we don't find new audios for the current bin, we store + # the reference to the last set so as to conserve memory. + # This is not such a big problem if the audio duration is + # bounded by a small number like 30 seconds and the + # bin size is big enough. 
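Each record consumed here via stream_audio_index (defined above) is one tab-separated line: id, duration in seconds, size in bytes, relative path, then zero or more tags, with untagged rows falling back to the empty UNK tag. A hypothetical row and the parse the reader performs on it:

    line = "42\t3.250000\t104044\tfreesound/ambience/cafe_01.wav\tcafe\tbabble\n"  # made-up record
    tok = line.strip().split("\t")
    idx, duration, size, relpath = int(tok[0]), float(tok[1]), int(tok[2]), tok[3]
    tags = tok[4:] if len(tok) > 4 else [""]     # "" is the UNK tag used above
    print(idx, duration, size, relpath, tags)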
But, for raw freesound audios, + # some audios can be as long as a few hours! + last_audio_set = set() + + # The same but for each tag so that we can pick audios based on + # tags and also some user-specified tag distribution. + last_audio_set_per_tag = defaultdict(lambda: set()) + + # Set of lists of bins sharing the same audio sets. + shared = set() + + for i in range(len(self.duration_bins) - 1, -1, -1): + lower_bound = self.duration_bins[i] + new_audio_idxs = set() + new_audio_idxs_per_tag = defaultdict(lambda: set()) + while cur < len(rank) and rank[cur][0] >= lower_bound: + idx = rank[cur][1] + tags = self.idx_to_record[idx][3] + new_audio_idxs.add(idx) + for tag in tags: + new_audio_idxs_per_tag[tag].add(idx) + cur += 1 + # This makes certain that the same list is shared across + # different bins if no new indices are added. + if cur == last: + shared.add(lower_bound) + else: + last_audio_set = last_audio_set | new_audio_idxs + for tag in new_audio_idxs_per_tag: + last_audio_set_per_tag[tag] = \ + last_audio_set_per_tag[tag] | \ + new_audio_idxs_per_tag[tag] + if len(shared) > 0: + self.shared_duration_bins.append(shared) + shared = set([lower_bound]) + ### last_audio_set = set() should set blank + last = cur + self.duration_to_id_set[lower_bound] = last_audio_set + for tag in last_audio_set_per_tag: + self.duration_to_id_set_per_tag[lower_bound][tag] = \ + last_audio_set_per_tag[tag] + + # The last `shared` record isn't added to the `shared_duration_bins`. + self.shared_duration_bins.append(shared) + + # We make sure that the while loop above has exhausted through the + # `rank` list by checking if the `cur`rent index in `rank` equals + # the length of the array, which is the halting condition. + assert cur == len(rank) + + return True + + def _build_index_from_records(self, tag_list): + """ Uses the in-memory records read from the index file to build + an in-memory index restricted to the given tag list. + + :param tag_list: List of tags we are interested in sampling from. + :type tag_list: list(str) + + Updates: + self.id_set (set): the set of all audio id's that can be sampled. + self.duration_to_list (dict): maps from the duration lower bound + to the id's of audios longer than this duration. + self.duration_to_list_per_tag (dict): maps from the tag to + the same structure as self.duration_to_list. This is to support + sampling from a target noise distribution. + + :return: whether the index was built from scratch + """ + if self.tags == tag_list: + return False + + self.tags = tag_list + if len(tag_list) == 0: + self.id_set = self.id_set_complete + else: + self.id_set = set() + for tag in tag_list: + self.id_set |= self.tag_to_id_set[tag] + + # Next, we need to take a subset of the audio files + for shared in self.shared_duration_bins: + # All bins in `shared' have the same index lists + # so we can intersect once and set all of them to this list. 
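The duration_bins list built a few lines up is just the sequence of lower bounds min_duration, min_duration + bin_size, ... running past max_duration; sample_audio (further below) bisects into this sorted list so that every audio stored under the chosen bin is at least as long as the request. A toy version of that bookkeeping:

    import bisect

    min_duration, max_duration, bin_size = 1.3, 9.7, 2.0
    duration_bins = []
    dur = min_duration
    while dur < max_duration + bin_size:
        duration_bins.append(dur)
        dur += bin_size
    # duration_bins is now roughly [1.3, 3.3, 5.3, 7.3, 9.3, 11.3]

    wanted = 4.0                                     # need audio of at least 4.0 s
    i = bisect.bisect_left(duration_bins, wanted)    # first bin key >= wanted
    print(duration_bins[i])                          # ~5.3: everything in this bin qualifies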
+ lb = list(shared)[0] + intersected = list(self.id_set & self.duration_to_id_set[lb]) + duration_to_id_set = self.duration_to_id_set_per_tag[lb] + intersected_per_tag = { + tag: self.tag_to_id_set[tag] & duration_to_id_set[tag] + for tag in duration_to_id_set + } + for bin_key in shared: + self.duration_to_list[bin_key] = intersected + for tag in intersected_per_tag: + self.duration_to_list_per_tag[tag][bin_key] = \ + intersected_per_tag[tag] + assert len(self.duration_to_list) == len(self.duration_to_id_set) + return True + + def refresh_records_from_index_file(self, + audio_dir, + idx_fname, + tag_list, + bin_size=2.0): + """ Loads the index file and populates the records + for building the internal index. + + If the audio directory or index file name has changed, the whole index + is reloaded from scratch. If only the tag_list is changed, then the + desired index is built from the complete, in-memory record. + + :param audio_dir: audio directory + :type audio_dir: basestring + :param idx_fname: audio index file name + :type idex_fname: basestring + :param tag_list: list of tags we are interested in loading; + if empty, we load all. + :type tag_list: list + :param bin_size: optional argument for controlling the granularity + of duration bins + :type bin_size: float + """ + if tag_list is None: + tag_list = [] + reloaded_records = self._load_all_records_from_disk(audio_dir, + idx_fname, bin_size) + if reloaded_records or self.tags != tag_list: + self._build_index_from_records(tag_list) + logger.info('loaded {} audio files from {}' + .format(len(self.id_set), idx_fname)) + + def sample_audio(self, duration, rng=None, distr=None): + """ Uniformly draws an audio record of at least the desired duration + + :param duration: minimum desired audio duration + :type duration: float + :param rng: random number generator + :type rng: random.Random + :param distr: target distribution of audio tags. If not provided, + :type distr: dict + all audio files are sampled uniformly at random. + + :returns: success, (duration, file_size, path) + """ + if duration < 0.0: + duration = self.min_duration + i = bisect.bisect_left(self.duration_bins, duration) + if i == len(self.duration_bins): + return False, None + bin_key = self.duration_bins[i] + if distr is None: + indices = self.duration_to_list[bin_key] + else: + # If a desired audio distribution is given, we sample from it. 
+            if rng is None:
+                rng = random.Random()
+            nprng = np.random.RandomState(rng.getrandbits(32))
+            prob_masses = distr.values()
+            prob_masses /= np.sum(prob_masses)
+            tag = nprng.choice(distr.keys(), p=prob_masses)
+            indices = self.duration_to_list_per_tag[tag][bin_key]
+        if len(indices) == 0:
+            return False, None
+        else:
+            if rng is None:
+                rng = random.Random()
+            # duration, file size and relative path from root
+            s = self.idx_to_record[rng.sample(indices, 1)[0]]
+            s = (s[0], s[1], os.path.join(self.audio_dir, s[2]))
+            return True, s
diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py
index abe1a0ec..c0a70ad1 100755
--- a/data_utils/augmentor/augmentation.py
+++ b/data_utils/augmentor/augmentation.py
@@ -6,6 +6,11 @@ from __future__ import print_function
 import json
 import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
+from data_utils.augmentor.resampler import ResamplerAugmentor
+from data_utils.augmentor.speed_perturb import SpeedPerturbationAugmentor
+from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor
+from data_utils.augmentor.implus_response import ImpulseResponseAugmentor
+from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor
 
 
 class AugmentationPipeline(object):
@@ -76,5 +81,15 @@ class AugmentationPipeline(object):
         """Return an augmentation model by the type name, and pass in params."""
         if augmentor_type == "volume":
             return VolumePerturbAugmentor(self._rng, **params)
+        if augmentor_type == "resample":
+            return ResamplerAugmentor(self._rng, **params)
+        if augmentor_type == "speed":
+            return SpeedPerturbationAugmentor(self._rng, **params)
+        if augmentor_type == "online_bayesian_normalization":
+            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
+        if augmentor_type == "impulse_response":
+            return ImpulseResponseAugmentor(self._rng, **params)
+        if augmentor_type == "noise_speech":
+            return NoiseSpeechAugmentor(self._rng, **params)
         else:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
diff --git a/data_utils/augmentor/implus_response.py b/data_utils/augmentor/implus_response.py
new file mode 100755
index 00000000..cc205342
--- /dev/null
+++ b/data_utils/augmentor/implus_response.py
@@ -0,0 +1,76 @@
+""" Impulse response"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import base
+from . import audio_database
+from data_utils.speech import SpeechSegment
+
+
+class ImpulseResponseAugmentor(base.AugmentorBase):
+    """ Instantiates an impulse response model.
+
+    :param ir_dir: directory containing impulse responses
+    :type ir_dir: basestring
+    :param tags: optional parameter for specifying what
+        particular impulse responses to apply.
+    :type tags: list
+    :param tag_distr: optional noise distribution
+    :type tag_distr: dict
+    """
+
+    def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None):
+        # Define all required parameter maps here.
+        self.ir_dir = ir_dir
+        self.index_file = index_file
+
+        self.tags = tags
+        self.tag_distr = tag_distr
+
+        self.audio_index = audio_database.AudioIndex()
+        self.rng = rng
+
+    def _init_data(self):
+        """ Preloads stuff from disk (e.g. the list of files) in an attempt
+        to make later loading faster. If the data configuration remains the
+        same, this function does nothing.
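With the dispatch above, the new augmentors become reachable from the pipeline's JSON configuration. Assuming the pipeline keeps the schema used for the existing volume perturbation, i.e. a list of {"type", "params", "prob"} objects (that schema is not shown in this diff, so treat the field names as an assumption), a config exercising two of the new blocks might look like the sketch below; the paths and numbers are placeholders.

    import json

    augmentation_config = json.dumps([
        {"type": "speed",                      # routed to the speed perturbation augmentor
         "params": {"speed_min": 0.95, "speed_max": 1.05},
         "prob": 0.5},
        {"type": "noise_speech",               # routed to the noise addition augmentor
         "params": {"snr_min": 5.0, "snr_max": 30.0,
                    "noise_dir": "/path/to/noise",              # placeholder
                    "source": "freesound",
                    "index_file": "/path/to/noise/index.txt"},  # placeholder
         "prob": 0.5},
    ])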
+ + """ + self.audio_index.refresh_records_from_index_file( + self.ir_dir, self.index_file, self.tags) + + def transform_audio(self, audio_segment): + """ Convolves the input audio with an impulse response. + + :param audio_segment: input audio + :type audio_segment: AudioSegemnt + """ + # This handles the cases where the data source or directories change. + self._init_data() + + read_size = 0 + tag_distr = self.tag_distr + if not self.audio_index.has_audio(tag_distr): + if tag_distr is None: + if not self.tags: + raise RuntimeError("The ir index does not have audio " + "files to sample from.") + else: + raise RuntimeError("The ir index does not have audio " + "files of the given tags to sample " + "from.") + else: + raise RuntimeError("The ir index does not have audio " + "files to match the target ir " + "distribution.") + else: + # Querying with a negative duration triggers the index to search + # from all impulse responses. + success, record = self.audio_index.sample_audio( + -1.0, rng=self.rng, distr=tag_distr) + if success is True: + _, read_size, ir_fname = record + ir_wav = SpeechSegment.from_file(ir_fname) + audio_segment.convolve(ir_wav, allow_resampling=True) diff --git a/data_utils/augmentor/noise_speech.py b/data_utils/augmentor/noise_speech.py new file mode 100755 index 00000000..8cf7c27b --- /dev/null +++ b/data_utils/augmentor/noise_speech.py @@ -0,0 +1,318 @@ +""" noise speech +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import logging +import numpy as np +import os +from collections import defaultdict + +from . import base +from . import audio_database +from data_utils.speech import SpeechSegment + +TURK = "turk" +USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"]) +HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0 +FIND_NOISE_MAX_ATTEMPTS = 20 + +logger = logging.getLogger(__name__) + + +def get_first_smaller(items, value): + index = bisect.bisect_left(items, value) - 1 + assert items[index] < value, \ + 'get_first_smaller failed! %d %d' % (items[index], value) + return items[index] + + +def get_first_larger(items, value): + 'Find leftmost value greater than value' + index = bisect.bisect_right(items, value) + assert index < len(items), \ + "no noise bin exists for this audio length (%f)" % value + assert items[index] > value, \ + 'get_first_larger failed! %d %d' % (items[index], value) + return items[index] + + +def _get_turk_noise_files(noise_dir, index_file): + """ Creates a map from duration => a list of noise filenames + + :param noise_dir: Directory of noise files which contains + "noise-samples-list" + :type noise_dir: basestring + :param index_file: Noise list + :type index_file: basestring + + returns:noise_files (defaultdict): A map of bins to noise files. + Each key is the duration, and the value is a list of noise + files binned to this duration. Each bin is 2 secs. 
+ + Note: noise-samples-list should contain one line per noise (wav) file + along with its duration in milliseconds + """ + noise_files = defaultdict(list) + if not os.path.exists(index_file): + logger.error('No noise files were found at {}'.format(index_file)) + return noise_files + num_noise_files = 0 + rounded_durations = list(range(0, 65, 2)) + with open(index_file, 'r') as fl: + for line in fl: + fname = os.path.join(noise_dir, line.strip().split()[0]) + duration = float(line.strip().split()[1]) / 1000 + # bin the noise files into length bins rounded by 2 sec + bin_id = get_first_smaller(rounded_durations, duration) + noise_files[bin_id].append(fname) + num_noise_files += 1 + logger.info('Loaded {} turk noise files'.format(num_noise_files)) + return noise_files + + +class NoiseSpeechAugmentor(base.AugmentorBase): + """ Noise addition block + + :param snr_min: minimum signal-to-noise ratio + :type snr_min: float + :param snr_max: maximum signal-to-noise ratio + :type snr_max: float + :param noise_dir: root of where noise files are stored + :type noise_fir: basestring + :param index_file: index of noises of interest in noise_dir + :type index_file: basestring + :param source: select one from + - turk + - freesound + - chime + Note that this field is no longer required for the freesound + and chime + :type source: string + :param tags: optional parameter for specifying what + particular noises we want to add. See above for the available tags. + :type tags: list + :param tag_distr: optional noise distribution + :type tag_distr: dict + """ + + def __init__(self, + rng, + snr_min, + snr_max, + noise_dir, + source, + allow_downsampling=None, + index_file=None, + tags=None, + tag_distr=None): + # Define all required parameter maps here. + self.rng = rng + self.snr_min = snr_min + self.snr_max = snr_max + self.noise_dir = noise_dir + self.source = source + + self.allow_downsampling = allow_downsampling + self.index_file = index_file + self.tags = tags + self.tag_distr = tag_distr + + # When new noise sources are added, make sure to define the + # associated bookkeeping variables here. + self.turk_noise_files = [] + self.turk_noise_dir = None + self.audio_index = audio_database.AudioIndex() + + def _init_data(self): + """ Preloads stuff from disk in an attempt (e.g. list of files, etc) + to make later loading faster. If the data configuration remains the + same, this function does nothing. 
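For the Turk source, the index is binned by duration: a file lands in the largest even bin strictly below its length (get_first_smaller over 0, 2, ..., 64 s). A couple of made-up rows of noise-samples-list (path plus duration in milliseconds) and the resulting map:

    import bisect
    from collections import defaultdict

    rounded_durations = list(range(0, 65, 2))            # 0, 2, ..., 64 seconds

    def get_first_smaller(items, value):
        return items[bisect.bisect_left(items, value) - 1]

    index_lines = ["cafe_noise_01.wav 5300", "street_noise_07.wav 12840"]  # hypothetical rows
    noise_files = defaultdict(list)
    for line in index_lines:
        fname, duration_ms = line.split()
        bin_id = get_first_smaller(rounded_durations, float(duration_ms) / 1000)
        noise_files[bin_id].append(fname)
    print(dict(noise_files))     # {4: ['cafe_noise_01.wav'], 12: ['street_noise_07.wav']}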
+ + """ + noise_dir = self.noise_dir + index_file = self.index_file + source = self.source + if not index_file: + if source == TURK: + index_file = os.path.join(noise_dir, 'noise-samples-list') + logger.debug("index_file not provided; " + "defaulting to " + + index_file) + else: + if source != "": + assert source in USE_AUDIO_DATABASE_SOURCES, \ + "{} not supported by audio_database".format(source) + index_file = os.path.join(noise_dir, + "audio_index_commercial.txt") + logger.debug("index_file not provided; " + "defaulting to " + + index_file) + + if source == TURK: + if self.turk_noise_dir != noise_dir: + self.turk_noise_dir = noise_dir + self.turk_noise_files = _get_turk_noise_files(noise_dir, + index_file) + # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: + else: + if source != "": + assert source in USE_AUDIO_DATABASE_SOURCES, \ + "{} not supported by audio_database".format(source) + self.audio_index.refresh_records_from_index_file( + self.noise_dir, index_file, self.tags) + + def transform_audio(self, audio_segment): + """Adds walla noise + + :param audio_segment: Input audio + :type audio_segment: SpeechSegment + """ + # This handles the cases where the data source or directories change. + self._init_data + source = self.source + allow_downsampling = self.allow_downsampling + if source == TURK: + self._add_turk_noise(audio_segment, self.rng, allow_downsampling) + # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: + else: + self._add_noise(audio_segment, self.rng, allow_downsampling) + + def _sample_snr(self): + """ Returns a float sampled in [`self.snr_min`, `self.snr_max`] + if both `self.snr_min` and `self.snr_max` are non-zero. + """ + snr_min = self.snr_min + snr_max = self.snr_max + sampled_snr = self.rng.uniform(snr_min, snr_max) + return sampled_snr + + def _add_turk_noise(self, audio_segment, allow_downsampling): + """ Adds a turk noise to the input audio. + + :param audio_segment: input audio + :type audio_segment: audiosegment + :param allow_downsampling: indicates whether downsampling + is allowed + :type allow_downsampling: boolean + """ + read_size = 0 + if len(self.turk_noise_files) > 0: + snr = self._sample_snr(self.rng) + # Draw the noise file randomly from noise files that are + # slightly longer than the utterance + noise_bins = sorted(self.turk_noise_files.keys()) + # note some bins can be empty, so we can't just round up + # to the nearest 2-sec interval + rounded_duration = get_first_larger(noise_bins, + audio_segment.duration) + noise_fname = \ + self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0] + noise = SpeechSegment.from_wav_file(noise_fname) + logger.debug('noise_fname {}'.format(noise_fname)) + logger.debug('snr {}'.format(snr)) + read_size = len(noise) * 2 + # May throw exceptions, but this is caught by + # AudioFeaturizer.get_audio_files. + audio_segment.add_noise( + noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) + + def _add_noise(self, audio_segment, allow_downsampling): + """ Adds a noise indexed in audio_database.AudioIndex. 
+ + :param audio_segment: input audio + :type audio_segment: SpeechSegment + :param allow_downsampling: indicates whether downsampling + is allowed + :type allow_downsampling: boolean + + Returns: + (SpeechSegment, int) + - sound with turk noise added + - number of bytes read from disk + """ + read_size = 0 + tag_distr = self.tag_distr + if not self.audio_index.has_audio(tag_distr): + if tag_distr is None: + if not self.tags: + raise RuntimeError("The noise index does not have audio " + "files to sample from.") + else: + raise RuntimeError("The noise index does not have audio " + "files of the given tags to sample " + "from.") + else: + raise RuntimeError("The noise index does not have audio " + "files to match the target noise " + "distribution.") + else: + # Compute audio segment related statistics + audio_duration = audio_segment.duration + + # Sample relevant augmentation parameters. + snr = self._sample_snr(self.rng) + + # Perhaps, we may not have a sufficiently long noise, so we need + # to search iteratively. + min_duration = audio_duration + 0.25 + for _ in range(FIND_NOISE_MAX_ATTEMPTS): + logger.debug("attempting to find noise of length " + "at least {}".format(min_duration)) + + success, record = \ + self.audio_index.sample_audio(min_duration, + rng=self.rng, + distr=tag_distr) + + if success is True: + noise_duration, read_size, noise_fname = record + + # Assert after logging so we know + # what caused augmentation to fail. + logger.debug("noise_fname {}".format(noise_fname)) + logger.debug("snr {}".format(snr)) + assert noise_duration >= min_duration + break + + # Decrease the desired minimum duration linearly. + # If the value becomes smaller than some threshold, + # we half the value instead. + if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD: + min_duration -= 2.0 + else: + min_duration *= 0.5 + + if success is False: + logger.info("Failed to find a noise file") + return + + diff_duration = audio_duration + 0.25 - noise_duration + if diff_duration >= 0.0: + # Here, the noise is shorter than the audio file, so + # we pad with zeros to make sure the noise sound is applied + # with a uniformly random shift. + noise = SpeechSegment.from_file(noise_fname) + noise = noise.pad_silence(diff_duration, sides="both") + else: + # The noise clip is at least ~25 ms longer than the audio + # segment here. + diff_duration = int(noise_duration * audio_segment.sample_rate) - \ + int(audio_duration * audio_segment.sample_rate) - \ + int(0.02 * audio_segment.sample_rate) + start = float(self.rng.randint(0, diff_duration)) / \ + audio.sample_rate + finish = min(start + audio_duration + 0.2, noise_duration) + noise = SpeechSegment.slice_from_file(noise_fname, start, + finish) + + if len(noise) < len(audio_segment): + # This is to ensure that the noise clip is at least as + # long as the audio segment. + num_samples_to_pad = len(audio_segment) - len(noise) + # Padding this amount of silence on both ends ensures that + # the placement of the noise clip is uniformly random. 
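That silence padding is what makes the noise placement uniformly random: a clip that falls num_samples_to_pad short of the utterance is padded by that amount on both ends, so any utterance-length window cut from the padded clip contains the whole noise at a uniformly distributed offset. The same trick on arrays:

    import random
    import numpy as np

    rng = random.Random(0)
    speech = np.zeros(16000)                 # 1.0 s placeholder utterance
    noise = np.ones(11000)                   # shorter placeholder noise clip

    pad = len(speech) - len(noise)           # how far the noise falls short
    padded = np.concatenate([np.zeros(pad), noise, np.zeros(pad)])
    start = rng.randint(0, len(padded) - len(speech))   # uniform random shift
    window = padded[start:start + len(speech)]          # same length as the speech
    mixed = speech + window                  # noise offset is uniform in [0, pad]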
+ silence = SpeechSegment( + np.zeros(num_samples_to_pad), audio_segment.sample_rate) + noise = SpeechSegment.concatenate(silence, noise, silence) + + audio_segment.add_noise( + noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py new file mode 100755 index 00000000..bc2d6c1b --- /dev/null +++ b/data_utils/augmentor/online_bayesian_normalization.py @@ -0,0 +1,57 @@ +""" Online bayesian normalization +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from . import base + + +class OnlineBayesianNormalizationAugmentor(base.AugmentorBase): + """ + Instantiates an online bayesian normalization module. + :param target_db: Target RMS value in decibels + :type target_db: func[int->scalar] + :param prior_db: Prior RMS estimate in decibels + :type prior_db: func[int->scalar] + :param prior_samples: Prior strength in number of samples + :type prior_samples: func[int->scalar] + :param startup_delay: Start-up delay in seconds during + which normalization statistics is accrued. + :type starup_delay: func[int->scalar] + """ + + def __init__(self, + rng, + target_db, + prior_db, + prior_samples, + startup_delay=base.parse_parameter_from(0.0)): + + self.target_db = target_db + self.prior_db = prior_db + self.prior_samples = prior_samples + self.startup_delay = startup_delay + self.rng = rng + + def transform_audio(self, audio_segment): + """ + Normalizes the input audio using the online Bayesian approach. + + :param audio_segment: input audio + :type audio_segment: SpeechSegment + :param iteration: current iteration + :type iteration: int + :param text: audio transcription + :type text: basestring + :param rng: RNG to use for augmentation + :type rng: random.Random + + """ + read_size = 0 + target_db = self.target_db(iteration) + prior_db = self.prior_db(iteration) + prior_samples = self.prior_samples(iteration) + startup_delay = self.startup_delay(iteration) + audio.normalize_online_bayesian( + target_db, prior_db, prior_samples, startup_delay=startup_delay) diff --git a/data_utils/augmentor/resampler.py b/data_utils/augmentor/resampler.py new file mode 100755 index 00000000..1b959be5 --- /dev/null +++ b/data_utils/augmentor/resampler.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from . import base + + +class ResamplerAugmentor(base.AugmentorBase): + """ Instantiates a resampler module. + + :param new_sample_rate: New sample rate in Hz + :type new_sample_rate: func[int->scalar] + :param rng: Random generator object. + :type rng: random.Random + """ + + def __init__(self, rng, new_sample_rate): + self.new_sample_rate = new_sample_rate + self._rng = rng + + def transform_audio(self, audio_segment): + """ Resamples the input audio to the target sample rate. + + Note that this is an in-place transformation. 
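Earlier in _add_noise, the search loop relaxes its length requirement between failed lookups: it starts at utterance_duration + 0.25 s, subtracts 2 s while the bound is still above HALF_NOISE_LENGTH_MIN_THRESHOLD (3 s), then halves it. The schedule in isolation, for a hypothetical 12 s utterance:

    FIND_NOISE_MAX_ATTEMPTS = 20
    HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0

    min_duration = 12.0 + 0.25            # pretend the utterance is 12 s long
    schedule = []
    for _ in range(FIND_NOISE_MAX_ATTEMPTS):
        schedule.append(round(min_duration, 3))
        # ... a real run would call audio_index.sample_audio(min_duration, ...)
        # here and break on success ...
        if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
            min_duration -= 2.0
        else:
            min_duration *= 0.5
    print(schedule[:8])   # [12.25, 10.25, 8.25, 6.25, 4.25, 2.25, 1.125, 0.562]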
+
+        :param audio_segment: input audio
+        :type audio_segment: SpeechSegment
+        """
+        new_sample_rate = self.new_sample_rate
+        audio_segment.resample(new_sample_rate)
\ No newline at end of file
diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py
new file mode 100755
index 00000000..e09be5f7
--- /dev/null
+++ b/data_utils/augmentor/speed_perturb.py
@@ -0,0 +1,53 @@
+"""Speed perturbation module for making ASR robust to different voice
+types (high pitched, low pitched, etc.).
+Samples a speed uniformly between speed_min and speed_max.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from . import base
+
+
+class SpeedPerturbationAugmentor(base.AugmentorBase):
+    """
+    Instantiates a speed perturbation module.
+
+    See reference paper here:
+
+    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
+
+    :param speed_min: Lower bound on new rate to sample
+    :type speed_min: float
+    :param speed_max: Upper bound on new rate to sample
+    :type speed_max: float
+    """
+
+    def __init__(self, rng, speed_min, speed_max):
+
+        if speed_min < 0.9:
+            raise ValueError(
+                "Sampling speed below 0.9 can cause unnatural effects")
+        if speed_max > 1.1:
+            raise ValueError(
+                "Sampling speed above 1.1 can cause unnatural effects")
+        self.speed_min = speed_min
+        self.speed_max = speed_max
+        self.rng = rng
+
+    def transform_audio(self, audio_segment):
+        """
+        Samples a new speed rate from the given range and
+        changes the speed of the given audio clip.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: input audio
+        :type audio_segment: SpeechSegment
+        """
+        sampled_speed = self.rng.uniform(self.speed_min, self.speed_max)
+        audio_segment.change_speed(sampled_speed)
diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py
index a5a9f6ca..15055b91 100755
--- a/data_utils/augmentor/volume_perturb.py
+++ b/data_utils/augmentor/volume_perturb.py
@@ -3,10 +3,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from data_utils.augmentor.base import AugmentorBase
+from . import base
 
 
-class VolumePerturbAugmentor(AugmentorBase):
+class VolumePerturbAugmentor(base.AugmentorBase):
     """Augmentation model for adding random volume perturbation.
 
     This is used for multi-loudness training of PCEN.
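Speed perturbation ultimately lands in the segment's change_speed helper; the np.linspace plus np.interp lines visible near the top of this patch suggest it simply resamples the waveform onto old_length / speed points, so a 1.05x speed shortens the clip by about 5% (that reading of change_speed is an assumption, since its full body is not shown here). A minimal sketch of the sample-then-interpolate step:

    import random
    import numpy as np

    rng = random.Random(0)
    speed_min, speed_max = 0.95, 1.05
    sampled_speed = rng.uniform(speed_min, speed_max)

    samples = np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0)  # 1 s test tone
    old_length = len(samples)
    new_length = int(old_length / sampled_speed)        # faster speech -> fewer samples
    old_indices = np.arange(old_length)
    new_indices = np.linspace(0, old_length, new_length)
    perturbed = np.interp(new_indices, old_indices, samples)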
See From d66d740ea0f6002e2fb48f2b5a304f76205b2fdb Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Fri, 16 Jun 2017 15:17:43 +0800 Subject: [PATCH 02/11] add audio part --- data_utils/audio.py | 457 ++++++++---------- data_utils/augmentor/audio_database.py | 401 --------------- data_utils/augmentor/augmentation.py | 15 - data_utils/augmentor/implus_response.py | 76 --- data_utils/augmentor/noise_speech.py | 318 ------------ .../online_bayesian_normalization.py | 57 --- data_utils/augmentor/resampler.py | 30 -- data_utils/augmentor/speed_perturb.py | 53 -- data_utils/augmentor/volume_perturb.py | 4 +- requirements.txt | 2 + 10 files changed, 215 insertions(+), 1198 deletions(-) delete mode 100755 data_utils/augmentor/audio_database.py delete mode 100755 data_utils/augmentor/implus_response.py delete mode 100755 data_utils/augmentor/noise_speech.py delete mode 100755 data_utils/augmentor/online_bayesian_normalization.py delete mode 100755 data_utils/augmentor/resampler.py delete mode 100755 data_utils/augmentor/speed_perturb.py diff --git a/data_utils/audio.py b/data_utils/audio.py index aef13c30..ee4e6d84 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -8,6 +8,7 @@ import io import soundfile import scikits.samplerate from scipy import signal +import random class AudioSegment(object): @@ -46,6 +47,32 @@ class AudioSegment(object): """Return whether two objects are unequal.""" return not self.__eq__(other) + def __len__(self): + """Returns length of segment in samples.""" + return self.num_samples + + def __add__(self, other): + """Add samples from another segment to those of this segment and return + a new segment (sample-wise addition, not segment concatenation). + + :param other: Segment containing samples to be + added in. + :type other: AudioSegment + :return: New segment containing resulting samples. + :rtype: AudioSegment + :raise TypeError: If sample rates of segments don't match, + or if length of segments don't match. + """ + if type(self) != type(other): + raise TypeError("Cannot add segment of different type: {}" + .format(type(other))) + if self._sample_rate != other._sample_rate: + raise TypeError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise TypeError("Segment lengths must match to add segments.") + samples = self.samples + other.samples + return type(self)(samples, sample_rate=self._sample_rate) + def __str__(self): """Return human-readable representation of segment.""" return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " @@ -64,69 +91,6 @@ class AudioSegment(object): samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) - @classmethod - def slice_from_file(cls, fname, start=None, end=None): - """ - Loads a small section of an audio without having to load - the entire file into the memory which can be incredibly wasteful. - - :param fname: input audio file name - :type fname: bsaestring - :param start: start time in seconds (supported granularity is ms) - If start is negative, it wraps around from the end. If not - provided, this function reads from the very beginning. - :type start: float - :param end: start time in seconds (supported granularity is ms) - If end is negative, it wraps around from the end. If not - provided, the default behvaior is to read to the end of the - file. - :type end: float - - :return:the specified slice of input audio in the audio.AudioSegment - format. 
- """ - sndfile = soundfile.SoundFile(fname) - - sample_rate = sndfile.samplerate - if sndfile.channels != 1: - raise TypeError("{} has more than 1 channel.".format(fname)) - - duration = float(len(sndfile)) / sample_rate - - if start is None: - start = 0.0 - if end is None: - end = duration - - if start < 0.0: - start += duration - if end < 0.0: - end += duration - - if start < 0.0: - raise IndexError("The slice start position ({} s) is out of " - "bounds. Filename: {}".format(start, fname)) - if end < 0.0: - raise IndexError("The slice end position ({} s) is out of bounds " - "Filename: {}".format(end, fname)) - - if start > end: - raise IndexError("The slice start position ({} s) is later than " - "the slice end position ({} s)." - .format(start, end)) - - if end > duration: - raise ValueError("The slice end time ({} s) is out of " - "bounds (> {} s) Filename: {}" - .format(end, duration, fname)) - - start_frame = int(start * sample_rate) - end_frame = int(end * sample_rate) - sndfile.seek(start_frame) - data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - - return cls(data, sample_rate) - @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. @@ -140,43 +104,30 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) - @classmethod - def make_silence(cls, duration, sample_rate): - """Creates a silent audio segment of the given duration and - sample rate. - - :param duration: length of silence in seconds - :type duration: scalar - :param sample_rate: sample rate - :type sample_rate: scalar - :returns: silence of the given duration - :rtype: AudioSegment - """ - samples = np.zeros(int(float(duration) * sample_rate)) - return cls(samples, sample_rate) - - @classmethod - def concatenate(cls, *segments): + def concatenate(self, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: input audio segments - :type *segments: [AudioSegment] + :param *segments: Input audio segments + :type *segments: AudioSegment + :return: Audio segment instance. + :rtype: AudioSegment + :raises ValueError: If number of segments is zero, or if sample_rate + not match between two audio segments + :raises TypeError: If item of segments is not Audiosegment instance """ # Perform basic sanity-checks. - N = len(segments) - if N == 0: + if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") sample_rate = segments[0]._sample_rate - for segment in segments: - if sample_rate != segment._sample_rate: + for seg in segments: + if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " "different sample rates") - if type(segment) is not cls: + if type(seg) is not type(self): raise TypeError("Only audio segments of the same type " "instance can be concatenated.") - samples = np.concatenate([seg.samples for seg in segments]) - return cls(samples, sample_rate) + return type(self)(samples, sample_rate) def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. @@ -203,6 +154,65 @@ class AudioSegment(object): format='WAV', subtype=subtype_map[dtype]) + def slice_from_file(self, file, start=None, end=None): + """Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param file: Input audio filepath + :type file: basestring + :param start: Start time in seconds. 
If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :return: The specified slice of input audio in the audio.AudioSegment format. + :rtype: AudioSegment + :rainse ValueError: If the position is error, or if the time is out bounds. + """ + sndfile = soundfile.SoundFile(file) + sample_rate = sndfile.samplerate + duration = float(len(sndfile)) / sample_rate + start = 0. if start is None else start + end = 0. if end is None else end + if start < 0.0: + start += duration + if end < 0.0: + end += duration + if start < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds. Filename: %s" % (start, file)) + if end < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds " + "Filename: %s" % (end, file)) + if start > end: + raise ValueError("The slice start position (%f s) is later than " + "the slice end position (%f s)." % (start, end)) + if end > duration: + raise ValueError("The slice end time (%f s) is out of bounds " + "(> %f s) Filename: %s" % (end, duration, file)) + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + return type(self)(data, sample_rate) + + def make_silence(self, duration, sample_rate): + """Creates a silent audio segment of the given duration and + sample rate. + + :param duration: Length of silence in seconds + :type duration: float + :param sample_rate: Sample rate + :type sample_rate: float + :return: Silence of the given duration + :rtype: AudioSegment + """ + samples = np.zeros(int(duration * sample_rate)) + return type(self)(samples, sample_rate) + def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. @@ -247,52 +257,49 @@ class AudioSegment(object): self._samples = np.interp(new_indices, old_indices, self._samples) def normalize(self, target_db=-20, max_gain_db=300.0): - """Normalize audio to desired RMS value in decibels. + """Normalize audio to be desired RMS value in decibels. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels.This value - should be less than 0.0 as 0.0 is full-scale audio. - :type target_db: float, optional - :param max_gain_db: Max amount of gain in dB that can be applied - for normalization. This is to prevent nans when attempting - to normalize a signal consisting of all zeros. - :type max_gain_db: float, optional - - :raises NormalizationWarning: if the required gain to normalize the - segment to the target_db value exceeds max_gain_db. + :param target_db: Target RMS value in decibels. This value should + be less than 0.0 as 0.0 is full-scale audio. + :type target_db: float + :param max_gain_db: Max amount of gain in dB that can be applied for + normalization. This is to prevent nans when attempting + to normalize a signal consisting of all zeros. + :type max_gain_db: float + :raises ValueError: If the required gain to normalize the segment to + the target_db value exceeds max_gain_db. 
""" gain = target_db - self.rms_db if gain > max_gain_db: raise ValueError( - "Unable to normalize segment to {} dB because it has an RMS " - "value of {} dB and the difference exceeds max_gain_db ({} dB)" - .format(target_db, self.rms_db, max_gain_db)) - gain = min(max_gain_db, target_db - self.rms_db) - self.apply_gain(gain) + "Unable to normalize segment to %f dB because it has an RMS " + "value of %f dB and the difference exceeds max_gain_db (%f dB)" + % (target_db, self.rms_db, max_gain_db)) + self.apply_gain(min(max_gain_db, target_db - self.rms_db)) def normalize_online_bayesian(self, target_db, prior_db, prior_samples, startup_delay=0.0): - """ - Normalize audio using a production-compatible online/causal algorithm. - This uses an exponential likelihood and gamma prior to make - online estimates of the RMS even when there are very few samples. + """Normalize audio using a production-compatible online/causal algorithm. + This uses an exponential likelihood and gamma prior to make online estimates + of the RMS even when there are very few samples. Note that this is an in-place transformation. :param target_db: Target RMS value in decibels - :type target_bd: scalar + :type target_bd: float :param prior_db: Prior RMS estimate in decibels - :type prior_db: scalar + :type prior_db: float :param prior_samples: Prior strength in number of samples - :type prior_samples: scalar - :param startup_delay: Default: 0.0 s. If provided, this - function will accrue statistics for the first startup_delay - seconds before applying online normalization. - :type startup_delay: scalar + :type prior_samples: float + :param startup_delay: Default 0.0 s. If provided, this function will accrue + statistics for the first startup_delay seconds before + applying online normalization. + :type startup_delay: float """ # Estimate total RMS online startup_sample_idx = min(self.num_samples - 1, @@ -309,88 +316,54 @@ class AudioSegment(object): mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / (sample_count + prior_samples)) rms_estimate_db = 10 * np.log10(mean_squared_estimate) - # Compute required time-varying gain gain_db = target_db - rms_estimate_db - - # Apply gain to new segment - self.apply_gain(gain_db) - - def normalize_ewma(self, - target_db, - decay_rate, - startup_delay, - rms_eps=1e-6, - max_gain_db=300.0): - startup_sample_idx = min(self.num_samples - 1, - int(self.sample_rate * startup_delay)) - mean_sq = self.samples**2 - if startup_sample_idx > 0: - mean_sq[:startup_sample_idx] = \ - np.sum(mean_sq[:startup_sample_idx]) / startup_sample_idx - idx_start = max(0, startup_sample_idx - 1) - initial_condition = mean_sq[idx_start] * decay_rate - mean_sq[idx_start:] = lfilter( - [1.0 - decay_rate], [1.0, -decay_rate], - mean_sq[idx_start:], - axis=0, - zi=[initial_condition])[0] - rms_estimate_db = 10.0 * np.log10(mean_sq + rms_eps) - gain_db = target_db - rms_estimate_db - if np.any(gain_db > max_gain_db): - warnings.warn( - "Unable to normalize segment to {} dB because it has an RMS " - "value of {} dB and the difference exceeds max_gain_db ({} dB)" - .format(target_db, self.rms_db, max_gain_db), - NormalizationWarning) - gain_db = np.minimum(gain_db, max_gain_db) self.apply_gain(gain_db) def resample(self, target_sample_rate, quality='sinc_medium'): - """Resample audio and return new AudioSegment. - This resamples the audio to a new sample rate and returns a brand - new AudioSegment. The existing AudioSegment is unchanged. + """Resample audio segment. 
This resamples the audio to a new + sample rate. Note that this is an in-place transformation. - :param new_sample_rate: target sample rate - :type new_sample_rate: scalar + :param target_sample_rate: Target sample rate + :type target_sample_rate: int :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. - Sets resampling speed/quality tradeoff. - See http://www.mega-nerd.com/SRC/api_misc.html#Converters + Sets resampling speed/quality tradeoff. + See http://www.mega-nerd.com/SRC/api_misc.html#Converters :type quality: basestring """ resample_ratio = target_sample_rate / self._sample_rate new_samples = scikits.samplerate.resample( self._samples, r=resample_ratio, type=quality) self._samples = new_samples - self._sample_rate = new_sample_rate + self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): """Pads this audio sample with a period of silence. Note that this is an in-place transformation. - :param duration: length of silence in seconds to pad + :param duration: Length of silence in seconds to pad :type duration: float - :param sides: - 'beginning' - adds silence in the beginning - 'end' - adds silence in the end - 'both' - adds silence in both the beginning and the end. - :type sides: basestring + :param sides: Position for padding + 'beginning' - adds silence in the beginning + 'end' - adds silence in the end + 'both' - adds silence in both the beginning and the end. + :type sides: str + :raises ValueError: If the sides not surport """ if duration == 0.0: return self - cls = type(self) - silence = cls.make_silence(duration, self._sample_rate) + silence = self.make_silence(duration, self._sample_rate) if sides == "beginning": - padded = cls.concatenate(silence, self) + padded = self.concatenate(silence, self) elif sides == "end": - padded = cls.concatenate(self, silence) + padded = self.concatenate(self, silence) elif sides == "both": - padded = cls.concatenate(silence, self, silence) + padded = self.concatenate(silence, self, silence) else: - raise ValueError("Unknown value for the kwarg 'sides'") + raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples self._sample_rate = padded._sample_rate @@ -398,88 +371,83 @@ class AudioSegment(object): """Return new AudioSegment containing audio between given boundaries. :param start_sec: Beginning of subsegment in seconds, - (beginning of segment if None). - :type start_sec: scalar + (beginning of segment if None). + :type start_sec: float :param end_sec: End of subsegment in seconds, - (end of segment if None). - :type end_sec: scalar - - :return: New AudioSegment containing specified - subsegment. - :trype: AudioSegment + (end of segment if None). + :type end_sec: float + :return: New AudioSegment containing specified subsegment. 
+        :rtype: AudioSegment
         """
-        # Default boundaries
-        if start_sec is None:
-            start_sec = 0.0
-        if end_sec is None:
-            end_sec = self.duration
-
+        start_sec = 0.0 if start_sec is None else start_sec
+        end_sec = self.duration if end_sec is None else end_sec
         # negative boundaries are relative to end of segment
         if start_sec < 0.0:
             start_sec = self.duration + start_sec
         if end_sec < 0.0:
             end_sec = self.duration + end_sec
-
         start_sample = int(round(start_sec * self._sample_rate))
         end_sample = int(round(end_sec * self._sample_rate))
         samples = self._samples[start_sample:end_sample]
-
         return type(self)(samples, sample_rate=self._sample_rate)
 
     def random_subsegment(self, subsegment_length, rng=None):
-        """
-        Return a random subsegment of a specified length in seconds.
+        """Return a random subsegment of a specified length in seconds.
 
         :param subsegment_length: Subsegment length in seconds.
-        :type subsegment_length: scalar
+        :type subsegment_length: float
         :param rng: Random number generator state
-        :type rng: random.Random [optional]
-
-
-        :return:clip (SpeechDLSegment): New SpeechDLSegmen containing random
-        subsegment of original segment.
+        :type rng: random.Random
+        :return: New AudioSegment containing a random subsegment of the
+                 original segment.
+        :rtype: AudioSegment
+        :raises ValueError: If the length of the subsegment is greater than
+                            the original segment.
         """
-        if rng is None:
-            rng = random.Random()
-
+        rng = random.Random() if rng is None else rng
         if subsegment_length > self.duration:
             raise ValueError("Length of subsegment must not be greater "
                              "than original segment.")
         start_time = rng.uniform(0.0, self.duration - subsegment_length)
         return self.subsegment(start_time, start_time + subsegment_length)
 
-    def convolve(self, ir, allow_resampling=False):
+    def convolve(self, impulse_segment, allow_resample=False):
         """Convolve this audio segment with the given filter.
 
-        :param ir: impulse response
-        :type ir: AudioSegment
-        :param allow_resampling: indicates whether resampling is allowed
-        when the ir has a different sample rate from this signal.
-        :type allow_resampling: boolean
-        """
-        if allow_resampling and self.sample_rate != ir.sample_rate:
-            ir = ir.resample(self.sample_rate)
-
-        if self.sample_rate != ir.sample_rate:
-            raise ValueError("Impulse response sample rate ({}Hz) is "
-                             "equal to base signal sample rate ({}Hz)."
-                             .format(ir.sample_rate, self.sample_rate))
+        Note that this is an in-place transformation.
 
-        samples = signal.fftconvolve(self.samples, ir.samples, "full")
+        :param impulse_segment: Impulse response segment.
+        :type impulse_segment: AudioSegment
+        :param allow_resample: Indicates whether resampling is allowed when
+                               the impulse_segment has a different sample
+                               rate from this signal.
+        :type allow_resample: boolean
+        :raises ValueError: If the sample rates of the two segments do not
+                            match and resampling is not allowed.
+        """
+        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
+            impulse_segment = impulse_segment.resample(self.sample_rate)
+        if self.sample_rate != impulse_segment.sample_rate:
+            raise ValueError("Impulse segment's sample rate (%d Hz) is not "
+                             "equal to base signal sample rate (%d Hz)."
% + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") self._samples = samples - def convolve_and_normalize(self, ir, allow_resample=False): + def convolve_and_normalize(self, impulse_segment, allow_resample=False): """Convolve and normalize the resulting audio segment so that it has the same average power as the input signal. - :param ir: impulse response - :type ir: AudioSegment - :param allow_resampling: indicates whether resampling is allowed - when the ir has a different sample rate from this signal. - :type allow_resampling: boolean + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: indicates whether resampling is allowed when + the impulse_segment has a different sample rate from this signal. + :type allow_resample: boolean """ - self.convolve(ir, allow_resampling=allow_resampling) - self.normalize(target_db=self.rms_db) + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) def add_noise(self, noise, @@ -492,36 +460,33 @@ class AudioSegment(object): of matching length is sampled from it and used instead. :param noise: Noise signal to add. - :type noise: SpeechDLSegment + :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. - :type snr_dB: scalar - :param allow_downsampling: whether to allow the noise signal - to be downsampled to match the base signal sample rate. + :type snr_dB: float + :param allow_downsampling: whether to allow the noise signal to be downsampled + to match the base signal sample rate. :type allow_downsampling: boolean - :param max_gain_db: Maximum amount of gain to apply to noise - signal before adding it in. This is to prevent attempting - to apply infinite gain to a zero signal. - :type max_gain_db: scalar + :param max_gain_db: Maximum amount of gain to apply to noise signal before + adding it in. This is to prevent attempting to apply infinite + gain to a zero signal. + :type max_gain_db: float :param rng: Random number generator state. :type rng: random.Random - - Returns: - SpeechDLSegment: signal with noise added. + :raises ValueError: If the sample rate does not match between the two audio segments + and resample is not allowed, or if the duration of noise segments + is shorter than original audio segments. """ - if rng is None: - rng = random.Random() - + rng = random.Random() if rng is None else rng if allow_downsampling and noise.sample_rate > self.sample_rate: noise = noise.resample(self.sample_rate) - if noise.sample_rate != self.sample_rate: - raise ValueError("Noise sample rate ({}Hz) is not equal to " - "base signal sample rate ({}Hz)." - .format(noise.sample_rate, self.sample_rate)) + raise ValueError("Noise sample rate (%d Hz) is not equal to " + "base signal sample rate (%d Hz)." % + (noise.sample_rate, self.sample_rate)) if noise.duration < self.duration: - raise ValueError("Noise signal ({} sec) must be at " - "least as long as base signal ({} sec)." - .format(noise.duration, self.duration)) + raise ValueError("Noise signal (%f sec) must be at " + "least as long as base signal (%f sec)." 
% + (noise.duration, self.duration)) noise_gain_db = self.rms_db - noise.rms_db - snr_dB noise_gain_db = min(max_gain_db, noise_gain_db) noise_subsegment = noise.random_subsegment(self.duration, rng=rng) @@ -529,6 +494,12 @@ class AudioSegment(object): self._samples = output._samples self._sample_rate = output._sample_rate + def tranform_noise(self, noise_subsegment, noise_gain_db): + """ tranform noise file + """ + return type(self)(noise_subsegment._samples * (10.**( + noise_gain_db / 20.)), noise_subsegment._sample_rate) + @property def samples(self): """Return audio samples. @@ -618,9 +589,3 @@ class AudioSegment(object): else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return output_samples.astype(dtype) - - def tranform_noise(self, noise_subsegment, noise_gain_db): - """ tranform noise file - """ - return type(self)(noise_subsegment._samples * (10.**( - noise_gain_db / 20.)), noise_subsegment._sample_rate) diff --git a/data_utils/augmentor/audio_database.py b/data_utils/augmentor/audio_database.py deleted file mode 100755 index e41c6dd7..00000000 --- a/data_utils/augmentor/audio_database.py +++ /dev/null @@ -1,401 +0,0 @@ -from __future__ import print_function -from collections import defaultdict -import bisect -import logging -import numpy as np -import os -import random -import sys - -UNK_TAG = "" - - -def stream_audio_index(fname, UNK=UNK_TAG): - """Reads an audio index file and emits one record in the index at a time. - - :param fname: audio index path - :type fname: basestring - :param UNK: UNK token to denote that certain audios are not tagged. - :type UNK: basesring - - Yields: - idx, duration, size, relpath, tags (int, float, int, str, list(str)): - audio file id, length of the audio in seconds, size in byte, - relative path w.r.t. to the root noise directory, list of tags - """ - with open(fname) as audio_index_file: - for i, line in enumerate(audio_index_file): - tok = line.strip().split("\t") - assert len(tok) >= 4, \ - "Invalid line at line {} in file {}".format( - i + 1, audio_index_file) - idx = int(tok[0]) - duration = float(tok[1]) - # Sometimes, the duration can round down to 0.0 - assert duration >= 0.0, \ - "Invalid duration at line {} in file {}".format( - i + 1, audio_index_file) - size = int(tok[2]) - assert size > 0, \ - "Invalid size at line {} in file {}".format( - i + 1, audio_index_file) - relpath = tok[3] - if len(tok) == 4: - tags = [UNK_TAG] - else: - tags = tok[4:] - yield idx, duration, size, relpath, tags - - -def truncate_float(val, ndigits=6): - """ Truncates a floating-point value to have the desired number of - digits after the decimal point. - - :param val: input value. - :type val: float - :parma ndigits: desired number of digits. - :type ndigits: int - - :return: truncated value - :rtype: float - """ - p = 10.0**ndigits - return float(int(val * p)) / p - - -def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout): - """Prints an audio record to the index file. - - :param idx: Audio file id. - :type idx: int - :param duration: length of the audio in seconds - :type duration: float - :param size: size of the file in bytes - :type size: int - :param relpath: relative path w.r.t. to the root noise directory. - :type relpath: basestring - :parma tags: list of tags - :parma tags: list(str) - :parma file: file to which we want to write an audio record. 
- :type file: sys.stdout - """ - file.write("{}\t{:.6f}\t{}\t{}" - .format(idx, truncate_float(duration, ndigits=6), size, relpath)) - for tag in tags: - file.write("\t{}".format(tag)) - file.write("\n") - - -class AudioIndex(object): - """ In-memory index of audio files that do not have annotations. - This supports duration-based sampling and sampling from a target - distribution. - - Each line in the index file consists of the following fields: - (id (int), duration (float), size (int), relative path (str), - list of tags ([str])) - """ - - def __init__(self): - self.audio_dir = None - self.index_fname = None - self.tags = None - self.bin_size = 2.0 - self.clear() - - def clear(self): - """ Clears the index - - Returns: - None - """ - self.idx_to_record = {} - # The list of indices correspond to audio files whose duration is - # greater than or equal to the key. - self.duration_to_id_set = {} - self.duration_to_id_set_per_tag = defaultdict(lambda: {}) - self.duration_to_list = defaultdict(lambda: []) - self.duration_to_list_per_tag = defaultdict( - lambda: defaultdict(lambda: [])) - self.tag_to_id_set = defaultdict(lambda: set()) - self.shared_duration_bins = [] - self.id_set_complete = set() - self.id_set = set() - self.duration_bins = [] - - def has_audio(self, distr=None): - """ - :param distr: The target distribution of audio tags that we want to - match. If this is not supplied, the function simply checks that - there are some audio files. - :parma distr: dict - :return: True if there are audio files. - :rtype: boolean - """ - if distr is None: - return len(self.id_set) > 0 - else: - for tag in distr: - if tag not in self.duration_to_list_per_tag: - return False - return True - - def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size): - """Loads all audio records from the disk into memory and groups them - into chunks based on their duration and the bin_size granalarity. - - Once all the records are read, indices are built from these records - by another function so that the audio samples can be drawn efficiently. - - Updates: - self.audio_dir (path): audio root directory - self.idx_fname (path): audio database index filename - self.bin_size (float): granularity of bins - self.idx_to_record (dict): maps from the audio id to - (duration, file_size, relative_path, tags) - self.tag_to_id_set (dict): maps from the tag to - the set of id's of audios that have this tag. - self.id_set_complete (set): set of all audio id's in the index file - self.min_duration (float): minimum audio duration observed in the - index file - self.duration_bins (list): the lower bounds on the duration of - audio files falling in each bin - self.duration_to_id_set (dict): contains (k, v) where v is the set - of id's of audios whose lengths are longer than or equal to k. - (e.g. k is the duration lower bound of this bin). - self.duration_to_id_set_per_tag (dict): Something like above but - has a finer granularity mapping from the tag to - duration_to_id_set. - self.shared_duration_bins (list): list of sets where each set - contains duration lower bounds whose audio id sets are the - same. The rationale for having this is that there are a few - but extremely long audio files which lead to a lot of bins. - When the id sets do not change across various minimum duration - boundaries, we - cluster these together and make them point to the same id set - reference. - - :return: whether the records were read from the disk. 
The assumption is - that the audio index file on disk and the actual audio files - are constructed once and never change during training. We only - re-read when either the directory or the index file path change. - """ - if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \ - self.bin_size == bin_size: - # The audio directory and/or the list of audio files - # haven't changed. No need to load the list again. - return False - - # Remember where the audio index is most recently read from. - self.audio_dir = audio_dir - self.idx_fname = idx_fname - self.bin_size = bin_size - - # Read in the idx and compute the number of bins necessary - self.clear() - rank = [] - min_duration = float('inf') - max_duration = float('-inf') - for idx, duration, file_size, relpath, tags in \ - stream_audio_index(idx_fname): - self.idx_to_record[idx] = (duration, file_size, relpath, tags) - max_duration = max(max_duration, duration) - min_duration = min(min_duration, duration) - rank.append((duration, idx)) - for tag in tags: - self.tag_to_id_set[tag].add(idx) - if len(rank) == 0: - # file is empty - raise IOError("Index file {} is empty".format(idx_fname)) - for tag in self.tag_to_id_set: - self.id_set_complete |= self.tag_to_id_set[tag] - dur = min_duration - self.min_duration = min_duration - while dur < max_duration + bin_size: - self.duration_bins.append(dur) - dur += bin_size - - # Sort in decreasing order of duration and populate - # the cumulative indices lists. - rank.sort(reverse=True) - - # These are indices for `rank` and used to keep track of whether - # there are new records to add in the current bin. - last = 0 - cur = 0 - - # The set of audios falling in the previous bin; in the case, - # where we don't find new audios for the current bin, we store - # the reference to the last set so as to conserve memory. - # This is not such a big problem if the audio duration is - # bounded by a small number like 30 seconds and the - # bin size is big enough. But, for raw freesound audios, - # some audios can be as long as a few hours! - last_audio_set = set() - - # The same but for each tag so that we can pick audios based on - # tags and also some user-specified tag distribution. - last_audio_set_per_tag = defaultdict(lambda: set()) - - # Set of lists of bins sharing the same audio sets. - shared = set() - - for i in range(len(self.duration_bins) - 1, -1, -1): - lower_bound = self.duration_bins[i] - new_audio_idxs = set() - new_audio_idxs_per_tag = defaultdict(lambda: set()) - while cur < len(rank) and rank[cur][0] >= lower_bound: - idx = rank[cur][1] - tags = self.idx_to_record[idx][3] - new_audio_idxs.add(idx) - for tag in tags: - new_audio_idxs_per_tag[tag].add(idx) - cur += 1 - # This makes certain that the same list is shared across - # different bins if no new indices are added. - if cur == last: - shared.add(lower_bound) - else: - last_audio_set = last_audio_set | new_audio_idxs - for tag in new_audio_idxs_per_tag: - last_audio_set_per_tag[tag] = \ - last_audio_set_per_tag[tag] | \ - new_audio_idxs_per_tag[tag] - if len(shared) > 0: - self.shared_duration_bins.append(shared) - shared = set([lower_bound]) - ### last_audio_set = set() should set blank - last = cur - self.duration_to_id_set[lower_bound] = last_audio_set - for tag in last_audio_set_per_tag: - self.duration_to_id_set_per_tag[lower_bound][tag] = \ - last_audio_set_per_tag[tag] - - # The last `shared` record isn't added to the `shared_duration_bins`. 
- self.shared_duration_bins.append(shared) - - # We make sure that the while loop above has exhausted through the - # `rank` list by checking if the `cur`rent index in `rank` equals - # the length of the array, which is the halting condition. - assert cur == len(rank) - - return True - - def _build_index_from_records(self, tag_list): - """ Uses the in-memory records read from the index file to build - an in-memory index restricted to the given tag list. - - :param tag_list: List of tags we are interested in sampling from. - :type tag_list: list(str) - - Updates: - self.id_set (set): the set of all audio id's that can be sampled. - self.duration_to_list (dict): maps from the duration lower bound - to the id's of audios longer than this duration. - self.duration_to_list_per_tag (dict): maps from the tag to - the same structure as self.duration_to_list. This is to support - sampling from a target noise distribution. - - :return: whether the index was built from scratch - """ - if self.tags == tag_list: - return False - - self.tags = tag_list - if len(tag_list) == 0: - self.id_set = self.id_set_complete - else: - self.id_set = set() - for tag in tag_list: - self.id_set |= self.tag_to_id_set[tag] - - # Next, we need to take a subset of the audio files - for shared in self.shared_duration_bins: - # All bins in `shared' have the same index lists - # so we can intersect once and set all of them to this list. - lb = list(shared)[0] - intersected = list(self.id_set & self.duration_to_id_set[lb]) - duration_to_id_set = self.duration_to_id_set_per_tag[lb] - intersected_per_tag = { - tag: self.tag_to_id_set[tag] & duration_to_id_set[tag] - for tag in duration_to_id_set - } - for bin_key in shared: - self.duration_to_list[bin_key] = intersected - for tag in intersected_per_tag: - self.duration_to_list_per_tag[tag][bin_key] = \ - intersected_per_tag[tag] - assert len(self.duration_to_list) == len(self.duration_to_id_set) - return True - - def refresh_records_from_index_file(self, - audio_dir, - idx_fname, - tag_list, - bin_size=2.0): - """ Loads the index file and populates the records - for building the internal index. - - If the audio directory or index file name has changed, the whole index - is reloaded from scratch. If only the tag_list is changed, then the - desired index is built from the complete, in-memory record. - - :param audio_dir: audio directory - :type audio_dir: basestring - :param idx_fname: audio index file name - :type idex_fname: basestring - :param tag_list: list of tags we are interested in loading; - if empty, we load all. - :type tag_list: list - :param bin_size: optional argument for controlling the granularity - of duration bins - :type bin_size: float - """ - if tag_list is None: - tag_list = [] - reloaded_records = self._load_all_records_from_disk(audio_dir, - idx_fname, bin_size) - if reloaded_records or self.tags != tag_list: - self._build_index_from_records(tag_list) - logger.info('loaded {} audio files from {}' - .format(len(self.id_set), idx_fname)) - - def sample_audio(self, duration, rng=None, distr=None): - """ Uniformly draws an audio record of at least the desired duration - - :param duration: minimum desired audio duration - :type duration: float - :param rng: random number generator - :type rng: random.Random - :param distr: target distribution of audio tags. If not provided, - :type distr: dict - all audio files are sampled uniformly at random. 
- - :returns: success, (duration, file_size, path) - """ - if duration < 0.0: - duration = self.min_duration - i = bisect.bisect_left(self.duration_bins, duration) - if i == len(self.duration_bins): - return False, None - bin_key = self.duration_bins[i] - if distr is None: - indices = self.duration_to_list[bin_key] - else: - # If a desired audio distribution is given, we sample from it. - if rng is None: - rng = random.Random() - nprng = np.random.RandomState(rng.getrandbits(32)) - prob_masses = distr.values() - prob_masses /= np.sum(prob_masses) - tag = nprng.choice(distr.keys(), p=prob_masses) - indices = self.duration_to_list_per_tag[tag][bin_key] - if len(indices) == 0: - return False, None - else: - if rng is None: - rng = random.Random() - # duration, file size and relative path from root - s = self.idx_to_record[rng.sample(indices, 1)[0]] - s = (s[0], s[1], os.path.join(self.audio_dir, s[2])) - return True, s diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index c0a70ad1..abe1a0ec 100755 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -6,11 +6,6 @@ from __future__ import print_function import json import random from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor -from data_utils.augmentor.resamler import ResamplerAugmentor -from data_utils.augmentor.speed_perturb import SpeedPerturbatioAugmentor -from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor -from data_utils.augmentor.Impulse_response import ImpulseResponseAugmentor -from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor class AugmentationPipeline(object): @@ -81,15 +76,5 @@ class AugmentationPipeline(object): """Return an augmentation model by the type name, and pass in params.""" if augmentor_type == "volume": return VolumePerturbAugmentor(self._rng, **params) - if augmentor_type == "resamle": - return ResamplerAugmentor(self._rng, **params) - if augmentor_type == "speed": - return SpeedPerturbatioAugmentor(self._rng, **params) - if augmentor_type == "online_bayesian_normalization": - return OnlineBayesianNormalizationAugmentor(self._rng, **params) - if augmentor_type == "Impulse_response": - return ImpulseResponseAugmentor(self._rng, **params) - if augmentor_type == "noise_speech": - return NoiseSpeechAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/data_utils/augmentor/implus_response.py b/data_utils/augmentor/implus_response.py deleted file mode 100755 index cc205342..00000000 --- a/data_utils/augmentor/implus_response.py +++ /dev/null @@ -1,76 +0,0 @@ -""" Impulse response""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base -from . import audio_database -from data_utils.speech import SpeechSegment - - -class ImpulseResponseAugmentor(base.AugmentorBase): - """ Instantiates an impulse response model - - :param ir_dir: directory containing impulse responses - :type ir_dir: basestring - :param tags: optional parameter for specifying what - particular impulse responses to apply. - :type tags: list - :parm tag_distr: optional noise distribution - :type tag_distr: dict - """ - - def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None): - # Define all required parameter maps here. 
- self.ir_dir = ir_dir - self.index_file = index_file - - self.tags = tags - self.tag_distr = tag_distr - - self.audio_index = audio_database.AudioIndex() - self.rng = rng - - def _init_data(self): - """ Preloads stuff from disk in an attempt (e.g. list of files, etc) - to make later loading faster. If the data configuration remains the - same, this function does nothing. - - """ - self.audio_index.refresh_records_from_index_file( - self.ir_dir, self.index_file, self.tags) - - def transform_audio(self, audio_segment): - """ Convolves the input audio with an impulse response. - - :param audio_segment: input audio - :type audio_segment: AudioSegemnt - """ - # This handles the cases where the data source or directories change. - self._init_data() - - read_size = 0 - tag_distr = self.tag_distr - if not self.audio_index.has_audio(tag_distr): - if tag_distr is None: - if not self.tags: - raise RuntimeError("The ir index does not have audio " - "files to sample from.") - else: - raise RuntimeError("The ir index does not have audio " - "files of the given tags to sample " - "from.") - else: - raise RuntimeError("The ir index does not have audio " - "files to match the target ir " - "distribution.") - else: - # Querying with a negative duration triggers the index to search - # from all impulse responses. - success, record = self.audio_index.sample_audio( - -1.0, rng=self.rng, distr=tag_distr) - if success is True: - _, read_size, ir_fname = record - ir_wav = SpeechSegment.from_file(ir_fname) - audio_segment.convolve(ir_wav, allow_resampling=True) diff --git a/data_utils/augmentor/noise_speech.py b/data_utils/augmentor/noise_speech.py deleted file mode 100755 index 8cf7c27b..00000000 --- a/data_utils/augmentor/noise_speech.py +++ /dev/null @@ -1,318 +0,0 @@ -""" noise speech -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import logging -import numpy as np -import os -from collections import defaultdict - -from . import base -from . import audio_database -from data_utils.speech import SpeechSegment - -TURK = "turk" -USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"]) -HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0 -FIND_NOISE_MAX_ATTEMPTS = 20 - -logger = logging.getLogger(__name__) - - -def get_first_smaller(items, value): - index = bisect.bisect_left(items, value) - 1 - assert items[index] < value, \ - 'get_first_smaller failed! %d %d' % (items[index], value) - return items[index] - - -def get_first_larger(items, value): - 'Find leftmost value greater than value' - index = bisect.bisect_right(items, value) - assert index < len(items), \ - "no noise bin exists for this audio length (%f)" % value - assert items[index] > value, \ - 'get_first_larger failed! %d %d' % (items[index], value) - return items[index] - - -def _get_turk_noise_files(noise_dir, index_file): - """ Creates a map from duration => a list of noise filenames - - :param noise_dir: Directory of noise files which contains - "noise-samples-list" - :type noise_dir: basestring - :param index_file: Noise list - :type index_file: basestring - - returns:noise_files (defaultdict): A map of bins to noise files. - Each key is the duration, and the value is a list of noise - files binned to this duration. Each bin is 2 secs. 
- - Note: noise-samples-list should contain one line per noise (wav) file - along with its duration in milliseconds - """ - noise_files = defaultdict(list) - if not os.path.exists(index_file): - logger.error('No noise files were found at {}'.format(index_file)) - return noise_files - num_noise_files = 0 - rounded_durations = list(range(0, 65, 2)) - with open(index_file, 'r') as fl: - for line in fl: - fname = os.path.join(noise_dir, line.strip().split()[0]) - duration = float(line.strip().split()[1]) / 1000 - # bin the noise files into length bins rounded by 2 sec - bin_id = get_first_smaller(rounded_durations, duration) - noise_files[bin_id].append(fname) - num_noise_files += 1 - logger.info('Loaded {} turk noise files'.format(num_noise_files)) - return noise_files - - -class NoiseSpeechAugmentor(base.AugmentorBase): - """ Noise addition block - - :param snr_min: minimum signal-to-noise ratio - :type snr_min: float - :param snr_max: maximum signal-to-noise ratio - :type snr_max: float - :param noise_dir: root of where noise files are stored - :type noise_fir: basestring - :param index_file: index of noises of interest in noise_dir - :type index_file: basestring - :param source: select one from - - turk - - freesound - - chime - Note that this field is no longer required for the freesound - and chime - :type source: string - :param tags: optional parameter for specifying what - particular noises we want to add. See above for the available tags. - :type tags: list - :param tag_distr: optional noise distribution - :type tag_distr: dict - """ - - def __init__(self, - rng, - snr_min, - snr_max, - noise_dir, - source, - allow_downsampling=None, - index_file=None, - tags=None, - tag_distr=None): - # Define all required parameter maps here. - self.rng = rng - self.snr_min = snr_min - self.snr_max = snr_max - self.noise_dir = noise_dir - self.source = source - - self.allow_downsampling = allow_downsampling - self.index_file = index_file - self.tags = tags - self.tag_distr = tag_distr - - # When new noise sources are added, make sure to define the - # associated bookkeeping variables here. - self.turk_noise_files = [] - self.turk_noise_dir = None - self.audio_index = audio_database.AudioIndex() - - def _init_data(self): - """ Preloads stuff from disk in an attempt (e.g. list of files, etc) - to make later loading faster. If the data configuration remains the - same, this function does nothing. 
- - """ - noise_dir = self.noise_dir - index_file = self.index_file - source = self.source - if not index_file: - if source == TURK: - index_file = os.path.join(noise_dir, 'noise-samples-list') - logger.debug("index_file not provided; " + "defaulting to " + - index_file) - else: - if source != "": - assert source in USE_AUDIO_DATABASE_SOURCES, \ - "{} not supported by audio_database".format(source) - index_file = os.path.join(noise_dir, - "audio_index_commercial.txt") - logger.debug("index_file not provided; " + "defaulting to " + - index_file) - - if source == TURK: - if self.turk_noise_dir != noise_dir: - self.turk_noise_dir = noise_dir - self.turk_noise_files = _get_turk_noise_files(noise_dir, - index_file) - # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: - else: - if source != "": - assert source in USE_AUDIO_DATABASE_SOURCES, \ - "{} not supported by audio_database".format(source) - self.audio_index.refresh_records_from_index_file( - self.noise_dir, index_file, self.tags) - - def transform_audio(self, audio_segment): - """Adds walla noise - - :param audio_segment: Input audio - :type audio_segment: SpeechSegment - """ - # This handles the cases where the data source or directories change. - self._init_data - source = self.source - allow_downsampling = self.allow_downsampling - if source == TURK: - self._add_turk_noise(audio_segment, self.rng, allow_downsampling) - # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: - else: - self._add_noise(audio_segment, self.rng, allow_downsampling) - - def _sample_snr(self): - """ Returns a float sampled in [`self.snr_min`, `self.snr_max`] - if both `self.snr_min` and `self.snr_max` are non-zero. - """ - snr_min = self.snr_min - snr_max = self.snr_max - sampled_snr = self.rng.uniform(snr_min, snr_max) - return sampled_snr - - def _add_turk_noise(self, audio_segment, allow_downsampling): - """ Adds a turk noise to the input audio. - - :param audio_segment: input audio - :type audio_segment: audiosegment - :param allow_downsampling: indicates whether downsampling - is allowed - :type allow_downsampling: boolean - """ - read_size = 0 - if len(self.turk_noise_files) > 0: - snr = self._sample_snr(self.rng) - # Draw the noise file randomly from noise files that are - # slightly longer than the utterance - noise_bins = sorted(self.turk_noise_files.keys()) - # note some bins can be empty, so we can't just round up - # to the nearest 2-sec interval - rounded_duration = get_first_larger(noise_bins, - audio_segment.duration) - noise_fname = \ - self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0] - noise = SpeechSegment.from_wav_file(noise_fname) - logger.debug('noise_fname {}'.format(noise_fname)) - logger.debug('snr {}'.format(snr)) - read_size = len(noise) * 2 - # May throw exceptions, but this is caught by - # AudioFeaturizer.get_audio_files. - audio_segment.add_noise( - noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) - - def _add_noise(self, audio_segment, allow_downsampling): - """ Adds a noise indexed in audio_database.AudioIndex. 
- - :param audio_segment: input audio - :type audio_segment: SpeechSegment - :param allow_downsampling: indicates whether downsampling - is allowed - :type allow_downsampling: boolean - - Returns: - (SpeechSegment, int) - - sound with turk noise added - - number of bytes read from disk - """ - read_size = 0 - tag_distr = self.tag_distr - if not self.audio_index.has_audio(tag_distr): - if tag_distr is None: - if not self.tags: - raise RuntimeError("The noise index does not have audio " - "files to sample from.") - else: - raise RuntimeError("The noise index does not have audio " - "files of the given tags to sample " - "from.") - else: - raise RuntimeError("The noise index does not have audio " - "files to match the target noise " - "distribution.") - else: - # Compute audio segment related statistics - audio_duration = audio_segment.duration - - # Sample relevant augmentation parameters. - snr = self._sample_snr(self.rng) - - # Perhaps, we may not have a sufficiently long noise, so we need - # to search iteratively. - min_duration = audio_duration + 0.25 - for _ in range(FIND_NOISE_MAX_ATTEMPTS): - logger.debug("attempting to find noise of length " - "at least {}".format(min_duration)) - - success, record = \ - self.audio_index.sample_audio(min_duration, - rng=self.rng, - distr=tag_distr) - - if success is True: - noise_duration, read_size, noise_fname = record - - # Assert after logging so we know - # what caused augmentation to fail. - logger.debug("noise_fname {}".format(noise_fname)) - logger.debug("snr {}".format(snr)) - assert noise_duration >= min_duration - break - - # Decrease the desired minimum duration linearly. - # If the value becomes smaller than some threshold, - # we half the value instead. - if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD: - min_duration -= 2.0 - else: - min_duration *= 0.5 - - if success is False: - logger.info("Failed to find a noise file") - return - - diff_duration = audio_duration + 0.25 - noise_duration - if diff_duration >= 0.0: - # Here, the noise is shorter than the audio file, so - # we pad with zeros to make sure the noise sound is applied - # with a uniformly random shift. - noise = SpeechSegment.from_file(noise_fname) - noise = noise.pad_silence(diff_duration, sides="both") - else: - # The noise clip is at least ~25 ms longer than the audio - # segment here. - diff_duration = int(noise_duration * audio_segment.sample_rate) - \ - int(audio_duration * audio_segment.sample_rate) - \ - int(0.02 * audio_segment.sample_rate) - start = float(self.rng.randint(0, diff_duration)) / \ - audio.sample_rate - finish = min(start + audio_duration + 0.2, noise_duration) - noise = SpeechSegment.slice_from_file(noise_fname, start, - finish) - - if len(noise) < len(audio_segment): - # This is to ensure that the noise clip is at least as - # long as the audio segment. - num_samples_to_pad = len(audio_segment) - len(noise) - # Padding this amount of silence on both ends ensures that - # the placement of the noise clip is uniformly random. 
- silence = SpeechSegment( - np.zeros(num_samples_to_pad), audio_segment.sample_rate) - noise = SpeechSegment.concatenate(silence, noise, silence) - - audio_segment.add_noise( - noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py deleted file mode 100755 index bc2d6c1b..00000000 --- a/data_utils/augmentor/online_bayesian_normalization.py +++ /dev/null @@ -1,57 +0,0 @@ -""" Online bayesian normalization -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class OnlineBayesianNormalizationAugmentor(base.AugmentorBase): - """ - Instantiates an online bayesian normalization module. - :param target_db: Target RMS value in decibels - :type target_db: func[int->scalar] - :param prior_db: Prior RMS estimate in decibels - :type prior_db: func[int->scalar] - :param prior_samples: Prior strength in number of samples - :type prior_samples: func[int->scalar] - :param startup_delay: Start-up delay in seconds during - which normalization statistics is accrued. - :type starup_delay: func[int->scalar] - """ - - def __init__(self, - rng, - target_db, - prior_db, - prior_samples, - startup_delay=base.parse_parameter_from(0.0)): - - self.target_db = target_db - self.prior_db = prior_db - self.prior_samples = prior_samples - self.startup_delay = startup_delay - self.rng = rng - - def transform_audio(self, audio_segment): - """ - Normalizes the input audio using the online Bayesian approach. - - :param audio_segment: input audio - :type audio_segment: SpeechSegment - :param iteration: current iteration - :type iteration: int - :param text: audio transcription - :type text: basestring - :param rng: RNG to use for augmentation - :type rng: random.Random - - """ - read_size = 0 - target_db = self.target_db(iteration) - prior_db = self.prior_db(iteration) - prior_samples = self.prior_samples(iteration) - startup_delay = self.startup_delay(iteration) - audio.normalize_online_bayesian( - target_db, prior_db, prior_samples, startup_delay=startup_delay) diff --git a/data_utils/augmentor/resampler.py b/data_utils/augmentor/resampler.py deleted file mode 100755 index 1b959be5..00000000 --- a/data_utils/augmentor/resampler.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class ResamplerAugmentor(base.AugmentorBase): - """ Instantiates a resampler module. - - :param new_sample_rate: New sample rate in Hz - :type new_sample_rate: func[int->scalar] - :param rng: Random generator object. - :type rng: random.Random - """ - - def __init__(self, rng, new_sample_rate): - self.new_sample_rate = new_sample_rate - self._rng = rng - - def transform_audio(self, audio_segment): - """ Resamples the input audio to the target sample rate. - - Note that this is an in-place transformation. 
- - :param audio: input audio - :type audio: SpeechDLSegment - """ - new_sample_rate = self.new_sample_rate - audio.resample(new_sample_rate) \ No newline at end of file diff --git a/data_utils/augmentor/speed_perturb.py b/data_utils/augmentor/speed_perturb.py deleted file mode 100755 index e09be5f7..00000000 --- a/data_utils/augmentor/speed_perturb.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Speed perturbation module for making ASR robust to different voice -types (high pitched, low pitched, etc) -Samples uniformly between speed_min and speed_max -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class SpeedPerturbatioAugmentor(base.AugmentorBase): - """ - Instantiates a speed perturbation module. - - See reference paper here: - - http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf - - :param speed_min: Lower bound on new rate to sample - :type speed_min: func[int->scalar] - :param speed_max: Upper bound on new rate to sample - :type speed_max: func[int->scalar] - """ - - def __init__(self, rng, speed_min, speed_max): - - if (speed_min < 0.9): - raise ValueError( - "Sampling speed below 0.9 can cause unnatural effects") - if (speed_min > 1.1): - raise ValueError( - "Sampling speed above 1.1 can cause unnatural effects") - self.speed_min = speed_min - self.speed_max = speed_max - self.rng = rng - - def transform_audio(self, audio_segment): - """ - Samples a new speed rate from the given range and - changes the speed of the given audio clip. - - Note that this is an in-place transformation. - - :param audio_segment: input audio - :type audio_segment: SpeechDLSegment - """ - read_size = 0 - speed_min = self.speed_min(iteration) - speed_max = self.speed_max(iteration) - sampled_speed = rng.uniform(speed_min, speed_max) - audio = audio.change_speed(sampled_speed) diff --git a/data_utils/augmentor/volume_perturb.py b/data_utils/augmentor/volume_perturb.py index 15055b91..a5a9f6ca 100755 --- a/data_utils/augmentor/volume_perturb.py +++ b/data_utils/augmentor/volume_perturb.py @@ -3,10 +3,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from . import base +from data_utils.augmentor.base import AugmentorBase -class VolumePerturbAugmentor(base.AugmentorBase): +class VolumePerturbAugmentor(AugmentorBase): """Augmentation model for adding random volume perturbation. This is used for multi-loudness training of PCEN. See diff --git a/requirements.txt b/requirements.txt index 58a93deb..c37e88ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ SoundFile==0.9.0.post1 wget==3.2 +scikits.samplerate==0.3.3 +scipy==0.13.0b1 From d1ee10be102263da5fbfac1e131c31ed605b5ad0 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Fri, 16 Jun 2017 18:29:56 +0800 Subject: [PATCH 03/11] modify audio and speech --- data_utils/audio.py | 14 ++++++++------ data_utils/speech.py | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index ee4e6d84..066437dc 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -104,7 +104,8 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) - def concatenate(self, *segments): + @classmethod + def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. 
:param *segments: Input audio segments @@ -123,11 +124,11 @@ class AudioSegment(object): if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " "different sample rates") - if type(seg) is not type(self): + if type(seg) is not cls: raise TypeError("Only audio segments of the same type " "instance can be concatenated.") samples = np.concatenate([seg.samples for seg in segments]) - return type(self)(samples, sample_rate) + return cls(samples, sample_rate) def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. @@ -355,13 +356,14 @@ class AudioSegment(object): """ if duration == 0.0: return self + cls = type(self) silence = self.make_silence(duration, self._sample_rate) if sides == "beginning": - padded = self.concatenate(silence, self) + padded = cls.concatenate(silence, self) elif sides == "end": - padded = self.concatenate(self, silence) + padded = cls.concatenate(self, silence) elif sides == "both": - padded = self.concatenate(silence, self, silence) + padded = cls.concatenate(silence, self, silence) else: raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples diff --git a/data_utils/speech.py b/data_utils/speech.py index 48db595b..5d1fc15a 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -65,6 +65,32 @@ class SpeechSegment(AudioSegment): audio = AudioSegment.from_bytes(bytes) return cls(audio.samples, audio.sample_rate, transcript) + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: Input speech segments + :type *segments: SpeechSegment + :return: Speech segment instance. + :rtype: SpeechSegment + :raises ValueError: If number of segments is zero, or if sample_rate + not match between two audio segments + :raises TypeError: If item of segments is not Audiosegment instance + """ + # Perform basic sanity-checks. + if len(segments) == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only speech segments of the same type " + "instance can be concatenated.") + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate, seg._transcript) + @property def transcript(self): """Return the transcript text. From 5ca270d30a34c71b0b851ed376fb7e7d90b3cf17 Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sat, 17 Jun 2017 09:03:18 +0800 Subject: [PATCH 04/11] add audio file --- data_utils/audio.py | 245 ++++++++++++++++++++----------------------- data_utils/speech.py | 55 ++++++++-- 2 files changed, 161 insertions(+), 139 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 066437dc..1f75da8a 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -47,32 +47,6 @@ class AudioSegment(object): """Return whether two objects are unequal.""" return not self.__eq__(other) - def __len__(self): - """Returns length of segment in samples.""" - return self.num_samples - - def __add__(self, other): - """Add samples from another segment to those of this segment and return - a new segment (sample-wise addition, not segment concatenation). - - :param other: Segment containing samples to be - added in. - :type other: AudioSegment - :return: New segment containing resulting samples. 
- :rtype: AudioSegment - :raise TypeError: If sample rates of segments don't match, - or if length of segments don't match. - """ - if type(self) != type(other): - raise TypeError("Cannot add segment of different type: {}" - .format(type(other))) - if self._sample_rate != other._sample_rate: - raise TypeError("Sample rates must match to add segments.") - if len(self._samples) != len(other._samples): - raise TypeError("Segment lengths must match to add segments.") - samples = self.samples + other.samples - return type(self)(samples, sample_rate=self._sample_rate) - def __str__(self): """Return human-readable representation of segment.""" return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " @@ -108,13 +82,13 @@ class AudioSegment(object): def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: Input audio segments + :param *segments: Input audio segments. :type *segments: AudioSegment - :return: Audio segment instance. + :return: Audio segment instance as concatenating results. :rtype: AudioSegment - :raises ValueError: If number of segments is zero, or if sample_rate - not match between two audio segments - :raises TypeError: If item of segments is not Audiosegment instance + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If every segment in is not Audiosegment instance. """ # Perform basic sanity-checks. if len(segments) == 0: @@ -155,12 +129,13 @@ class AudioSegment(object): format='WAV', subtype=subtype_map[dtype]) - def slice_from_file(self, file, start=None, end=None): + @classmethod + def slice_from_file(cls, file, start=None, end=None): """Loads a small section of an audio without having to load the entire file into the memory which can be incredibly wasteful. - :param file: Input audio filepath - :type file: basestring + :param file: Input audio filepath or file object. + :type file: basestring|file :param start: Start time in seconds. If start is negative, it wraps around from the end. If not provided, this function reads from the very beginning. @@ -169,9 +144,11 @@ class AudioSegment(object): from the end. If not provided, the default behvaior is to read to the end of the file. :type end: float - :return: The specified slice of input audio in the audio.AudioSegment format. + :return: AudioSegment instance of the specified slice of the input + audio file. :rtype: AudioSegment - :rainse ValueError: If the position is error, or if the time is out bounds. + :raise ValueError: If start or end is incorrectly set, e.g. out of + bounds in time. """ sndfile = soundfile.SoundFile(file) sample_rate = sndfile.samplerate @@ -184,40 +161,60 @@ class AudioSegment(object): end += duration if start < 0.0: raise ValueError("The slice start position (%f s) is out of " - "bounds. Filename: %s" % (start, file)) + "bounds." % start) if end < 0.0: - raise ValueError("The slice end position (%f s) is out of bounds " - "Filename: %s" % (end, file)) + raise ValueError("The slice end position (%f s) is out of bounds." % + end) if start > end: raise ValueError("The slice start position (%f s) is later than " "the slice end position (%f s)." 
% (start, end)) if end > duration: - raise ValueError("The slice end time (%f s) is out of bounds " - "(> %f s) Filename: %s" % (end, duration, file)) + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end, duration)) start_frame = int(start * sample_rate) end_frame = int(end * sample_rate) sndfile.seek(start_frame) data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - return type(self)(data, sample_rate) + return cls(data, sample_rate) - def make_silence(self, duration, sample_rate): + @classmethod + def make_silence(cls, duration, sample_rate): """Creates a silent audio segment of the given duration and sample rate. - :param duration: Length of silence in seconds + :param duration: Length of silence in seconds. :type duration: float - :param sample_rate: Sample rate + :param sample_rate: Sample rate. :type sample_rate: float - :return: Silence of the given duration + :return: Silent AudioSegment instance of the given duration. :rtype: AudioSegment """ samples = np.zeros(int(duration * sample_rate)) - return type(self)(samples, sample_rate) + return cls(samples, sample_rate) + + def superimposed(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + :param other: Segment containing samples to be added in. + :type other: AudioSegments + :raise TypeError: If type of two segments don't match. + :raise ValueError: If the sample_rate of two segments not equal, or if + the length of segments don't match. + """ + if type(self) != type(other): + raise TypeError("Cannot add segments of different types: %s " + "and %s." % (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. - :param dtype: Data type for export samples. Options: 'int16', 'int32', + :param dtype: Data type for export samples. Options: 'int16','int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str :return: Byte string containing audio content. @@ -258,16 +255,17 @@ class AudioSegment(object): self._samples = np.interp(new_indices, old_indices, self._samples) def normalize(self, target_db=-20, max_gain_db=300.0): - """Normalize audio to be desired RMS value in decibels. + """Normalize audio to be of the desired RMS value in decibels. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels. This value should - be less than 0.0 as 0.0 is full-scale audio. + :param target_db: Target RMS value in decibels. This value should be + less than 0.0 as 0.0 is full-scale audio. :type target_db: float :param max_gain_db: Max amount of gain in dB that can be applied for - normalization. This is to prevent nans when attempting - to normalize a signal consisting of all zeros. + normalization. This is to prevent nans when + attempting to normalize a signal consisting of + all zeros. :type max_gain_db: float :raises ValueError: If the required gain to normalize the segment to the target_db value exceeds max_gain_db. 
@@ -275,9 +273,9 @@ class AudioSegment(object):
         gain = target_db - self.rms_db
         if gain > max_gain_db:
             raise ValueError(
-                "Unable to normalize segment to %f dB because it has an RMS "
-                "value of %f dB and the difference exceeds max_gain_db (%f dB)"
-                % (target_db, self.rms_db, max_gain_db))
+                "Unable to normalize segment to %f dB because the required "
+                "gain exceeds max_gain_db (%f dB)." %
+                (target_db, max_gain_db))
         self.apply_gain(min(max_gain_db, target_db - self.rms_db))
 
     def normalize_online_bayesian(self,
@@ -285,30 +283,30 @@ class AudioSegment(object):
                                   prior_db,
                                   prior_samples,
                                   startup_delay=0.0):
-        """Normalize audio using a production-compatible online/causal algorithm.
-        This uses an exponential likelihood and gamma prior to make online estimates
-        of the RMS even when there are very few samples.
+        """Normalize audio using a production-compatible online/causal
+        algorithm. This uses an exponential likelihood and gamma prior to
+        make online estimates of the RMS even when there are very few samples.
 
         Note that this is an in-place transformation.
 
-        :param target_db: Target RMS value in decibels
+        :param target_db: Target RMS value in decibels.
         :type target_db: float
-        :param prior_db: Prior RMS estimate in decibels
+        :param prior_db: Prior RMS estimate in decibels.
         :type prior_db: float
-        :param prior_samples: Prior strength in number of samples
+        :param prior_samples: Prior strength in number of samples.
         :type prior_samples: float
-        :param startup_delay: Default 0.0 s. If provided, this function will accrue
-                              statistics for the first startup_delay seconds before
-                              applying online normalization.
+        :param startup_delay: Default 0.0 s. If provided, this function will
+                              accrue statistics for the first startup_delay
+                              seconds before applying online normalization.
         :type startup_delay: float
         """
-        # Estimate total RMS online
+        # Estimate total RMS online.
         startup_sample_idx = min(self.num_samples - 1,
                                  int(self.sample_rate * startup_delay))
         prior_mean_squared = 10.**(prior_db / 10.)
         prior_sum_of_squares = prior_mean_squared * prior_samples
         cumsum_of_squares = np.cumsum(self.samples**2)
-        sample_count = np.arange(len(self)) + 1
+        sample_count = np.arange(self.num_samples) + 1
         if startup_sample_idx > 0:
             cumsum_of_squares[:startup_sample_idx] = \
                 cumsum_of_squares[startup_sample_idx]
@@ -317,42 +315,40 @@ class AudioSegment(object):
         mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                  (sample_count + prior_samples))
         rms_estimate_db = 10 * np.log10(mean_squared_estimate)
-        # Compute required time-varying gain
+        # Compute required time-varying gain.
         gain_db = target_db - rms_estimate_db
         self.apply_gain(gain_db)
 
     def resample(self, target_sample_rate, quality='sinc_medium'):
-        """Resample audio segment. This resamples the audio to a new
-        sample rate.
+        """Resample the audio to a target sample rate.
 
         Note that this is an in-place transformation.
 
-        :param target_sample_rate: Target sample rate
+        :param target_sample_rate: Target sample rate.
         :type target_sample_rate: int
         :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
                         Sets resampling speed/quality tradeoff.
See http://www.mega-nerd.com/SRC/api_misc.html#Converters - :type quality: basestring + :type quality: str """ resample_ratio = target_sample_rate / self._sample_rate - new_samples = scikits.samplerate.resample( + self._samples = scikits.samplerate.resample( self._samples, r=resample_ratio, type=quality) - self._samples = new_samples self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): - """Pads this audio sample with a period of silence. + """Pad this audio sample with a period of silence. Note that this is an in-place transformation. - :param duration: Length of silence in seconds to pad + :param duration: Length of silence in seconds to pad. :type duration: float - :param sides: Position for padding - 'beginning' - adds silence in the beginning - 'end' - adds silence in the end + :param sides: Position for padding: + 'beginning' - adds silence in the beginning; + 'end' - adds silence in the end; 'both' - adds silence in both the beginning and the end. :type sides: str - :raises ValueError: If the sides not surport + :raises ValueError: If sides is not supported. """ if duration == 0.0: return self @@ -367,51 +363,41 @@ class AudioSegment(object): else: raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples - self._sample_rate = padded._sample_rate def subsegment(self, start_sec=None, end_sec=None): """Return new AudioSegment containing audio between given boundaries. - :param start_sec: Beginning of subsegment in seconds, - (beginning of segment if None). + :param start_sec: Beginning of subsegment in seconds. :type start_sec: float - :param end_sec: End of subsegment in seconds, - (end of segment if None). + :param end_sec: End of subsegment in seconds. :type end_sec: float - :return: New AudioSegment containing specified subsegment. - :rtype: AudioSegment """ start_sec = 0.0 if start_sec is None else start_sec end_sec = self.duration if end_sec is None else end_sec - # negative boundaries are relative to end of segment if start_sec < 0.0: start_sec = self.duration + start_sec if end_sec < 0.0: end_sec = self.duration + end_sec start_sample = int(round(start_sec * self._sample_rate)) end_sample = int(round(end_sec * self._sample_rate)) - samples = self._samples[start_sample:end_sample] - return type(self)(samples, sample_rate=self._sample_rate) + self._samples = self._samples[start_sample:end_sample] def random_subsegment(self, subsegment_length, rng=None): """Return a random subsegment of a specified length in seconds. :param subsegment_length: Subsegment length in seconds. :type subsegment_length: float - :param rng: Random number generator state + :param rng: Random number generator state. :type rng: random.Random - :return: New AudioSegment containing random subsegment - of original segment - :rtype: AudioSegment - :raises ValueError: If the length of subsegment greater than origineal - segemnt. + :raises ValueError: If the length of subsegment greater than + origineal segemnt. """ rng = random.Random() if rng is None else rng if subsegment_length > self.duration: raise ValueError("Length of subsegment must not be greater " "than original segment.") start_time = rng.uniform(0.0, self.duration - subsegment_length) - return self.subsegment(start_time, start_time + subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): """Convolve this audio segment with the given filter. 
@@ -420,10 +406,10 @@ class AudioSegment(object): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment - :param allow_resample: indicates whether resampling is allowed when - the impulse_segment has a different sample - rate from this signal. - :type allow_resample: boolean + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool :raises ValueError: If the sample rate is not match between two audio segments and resample is not allowed. """ @@ -443,9 +429,10 @@ class AudioSegment(object): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment - :param allow_resample: indicates whether resampling is allowed when - the impulse_segment has a different sample rate from this signal. - :type allow_resample: boolean + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool """ target_db = self.rms_db self.convolve(impulse_segment, allow_resample=allow_resample) @@ -465,42 +452,36 @@ class AudioSegment(object): :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. :type snr_dB: float - :param allow_downsampling: whether to allow the noise signal to be downsampled - to match the base signal sample rate. - :type allow_downsampling: boolean - :param max_gain_db: Maximum amount of gain to apply to noise signal before - adding it in. This is to prevent attempting to apply infinite - gain to a zero signal. + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. :type max_gain_db: float :param rng: Random number generator state. - :type rng: random.Random - :raises ValueError: If the sample rate does not match between the two audio segments - and resample is not allowed, or if the duration of noise segments - is shorter than original audio segments. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments and resample is not allowed, or if + the duration of noise segments is shorter than + original audio segments. """ rng = random.Random() if rng is None else rng if allow_downsampling and noise.sample_rate > self.sample_rate: noise = noise.resample(self.sample_rate) if noise.sample_rate != self.sample_rate: - raise ValueError("Noise sample rate (%d Hz) is not equal to " - "base signal sample rate (%d Hz)." % - (noise.sample_rate, self.sample_rate)) + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." % (noise.sample_rate, + self.sample_rate)) if noise.duration < self.duration: - raise ValueError("Noise signal (%f sec) must be at " - "least as long as base signal (%f sec)." % + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." 
% (noise.duration, self.duration)) - noise_gain_db = self.rms_db - noise.rms_db - snr_dB - noise_gain_db = min(max_gain_db, noise_gain_db) - noise_subsegment = noise.random_subsegment(self.duration, rng=rng) - output = self + self.tranform_noise(noise_subsegment, noise_gain_db) - self._samples = output._samples - self._sample_rate = output._sample_rate - - def tranform_noise(self, noise_subsegment, noise_gain_db): - """ tranform noise file - """ - return type(self)(noise_subsegment._samples * (10.**( - noise_gain_db / 20.)), noise_subsegment._sample_rate) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise.random_subsegment(self.duration, rng=rng) + noise.apply_gain(noise_gain_db) + self.superimposed(noise) @property def samples(self): @@ -571,7 +552,7 @@ class AudioSegment(object): Audio sample type is usually integer or float-point. For integer type, float32 will be rescaled from [-1, 1] to the maximum range supported by the integer type. - + This is for writing a audio file. """ dtype = np.dtype(dtype) diff --git a/data_utils/speech.py b/data_utils/speech.py index 5d1fc15a..443df68c 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -67,20 +67,20 @@ class SpeechSegment(AudioSegment): @classmethod def concatenate(cls, *segments): - """Concatenate an arbitrary number of audio segments together. + """Concatenate an arbitrary number of speech segments together. - :param *segments: Input speech segments + :param *segments: Input speech segments. :type *segments: SpeechSegment :return: Speech segment instance. :rtype: SpeechSegment - :raises ValueError: If number of segments is zero, or if sample_rate - not match between two audio segments - :raises TypeError: If item of segments is not Audiosegment instance + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If every segment in is not Audiosegment instance. """ - # Perform basic sanity-checks. if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") sample_rate = segments[0]._sample_rate + transcripts = "" for seg in segments: if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " @@ -88,8 +88,49 @@ class SpeechSegment(AudioSegment): if type(seg) is not cls: raise TypeError("Only speech segments of the same type " "instance can be concatenated.") + transcripts += seg._transcript samples = np.concatenate([seg.samples for seg in segments]) - return cls(samples, sample_rate, seg._transcript) + return cls(samples, sample_rate, transcripts) + + @classmethod + def slice_from_file(cls, filepath, start=None, end=None, transcript=""): + """Loads a small section of an speech without having to load + the entire file into the memory which can be incredibly wasteful. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :param transcript: Transcript text for the speech. if not provided, + the defaults is an empty string. + :type transript: basestring + :return: SpeechSegment instance of the specified slice of the input + speech file. 
+ :rtype: SpeechSegment + """ + audio = Audiosegment.slice_from_file(filepath, start, end) + return cls(audio.samples, audio.sample_rate, transcripts) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent speech segment of the given duration and + sample rate. + + :param duration: Length of silence in seconds. + :type duration: float + :param sample_rate: Sample rate. + :type sample_rate: float + :return: Silence of the given duration. + :rtype: AudioSegment + """ + audio = AudioSegment.make_silence(duration, sample_rate) + return cls(audio.samples, audio.sample_rate, "") @property def transcript(self): From b8341da63dfa2baccff73c197e0e3dae336ef4de Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 16:23:30 +0800 Subject: [PATCH 05/11] add audio augmentation --- data_utils/audio.py | 3 ++- data_utils/speech.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 1f75da8a..3c671b69 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -88,7 +88,8 @@ class AudioSegment(object): :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every segment in is not Audiosegment instance. + :raises TypeError: If every item in segments is not Audiosegment + instance. """ # Perform basic sanity-checks. if len(segments) == 0: diff --git a/data_utils/speech.py b/data_utils/speech.py index 443df68c..66f22b24 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -75,7 +75,8 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every segment in is not Audiosegment instance. + :raises TypeError: If every item in segments is not Audiosegment + instance. """ if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") From 107f8b89ae5f961748b89dfe1153cf4ef0288c6b Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 16:47:09 +0800 Subject: [PATCH 06/11] add audio augmentation --- data_utils/audio.py | 6 +++--- data_utils/speech.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 3c671b69..1ad20bf3 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -88,7 +88,7 @@ class AudioSegment(object): :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every item in segments is not Audiosegment + :raises TypeError: If every item in segments is not AudioSegment instance. """ # Perform basic sanity-checks. @@ -296,7 +296,7 @@ class AudioSegment(object): :type prior_db: float :param prior_samples: Prior strength in number of samples. :type prior_samples: float - :param startup_delay: Default 0.0 s. If provided, this function will + :param startup_delay: Default 0.0s. If provided, this function will accrue statistics for the first startup_delay seconds before applying online normalization. :type startup_delay: float @@ -401,7 +401,7 @@ class AudioSegment(object): self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): - """Convolve this audio segment with the given filter. 
+ """Convolve this audio segment with the given impulse_segment. Note that this is an in-place transformation. diff --git a/data_utils/speech.py b/data_utils/speech.py index 66f22b24..94ead1e8 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -75,11 +75,11 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every item in segments is not Audiosegment + :raises TypeError: If every item in segments is not SpeechSegment instance. """ if len(segments) == 0: - raise ValueError("No audio segments are given to concatenate.") + raise ValueError("No speech segments are given to concatenate.") sample_rate = segments[0]._sample_rate transcripts = "" for seg in segments: @@ -116,7 +116,7 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment """ audio = Audiosegment.slice_from_file(filepath, start, end) - return cls(audio.samples, audio.sample_rate, transcripts) + return cls(audio.samples, audio.sample_rate, transcript) @classmethod def make_silence(cls, duration, sample_rate): @@ -128,7 +128,7 @@ class SpeechSegment(AudioSegment): :param sample_rate: Sample rate. :type sample_rate: float :return: Silence of the given duration. - :rtype: AudioSegment + :rtype: SpeechSegment """ audio = AudioSegment.make_silence(duration, sample_rate) return cls(audio.samples, audio.sample_rate, "") From 21161b01653b98ea18903ff5bee07a127eee643d Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 17:11:58 +0800 Subject: [PATCH 07/11] add audio file --- data_utils/audio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 1ad20bf3..fd1f93df 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -87,7 +87,7 @@ class AudioSegment(object): :return: Audio segment instance as concatenating results. :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the - sample_rate of any two segments does not match. + sample_rate of any two segment does not match. :raises TypeError: If every item in segments is not AudioSegment instance. """ @@ -412,7 +412,7 @@ class AudioSegment(object): rate from this signal. :type allow_resample: bool :raises ValueError: If the sample rate is not match between two - audio segments and resample is not allowed. + audio segments when resample is not allowed. """ if allow_resample and self.sample_rate != impulse_segment.sample_rate: impulse_segment = impulse_segment.resample(self.sample_rate) @@ -464,8 +464,8 @@ class AudioSegment(object): :param rng: Random number generator state. :type rng: None|random.Random :raises ValueError: If the sample rate does not match between the two - audio segments and resample is not allowed, or if - the duration of noise segments is shorter than + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than original audio segments. 
""" rng = random.Random() if rng is None else rng From 25ce7ebe7b1029e823a9cdb758e808f6a0e0995e Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Sun, 18 Jun 2017 18:22:48 +0800 Subject: [PATCH 08/11] add audio file --- data_utils/audio.py | 4 ++-- data_utils/speech.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index fd1f93df..37f4f0ba 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -82,8 +82,8 @@ class AudioSegment(object): def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: Input audio segments. - :type *segments: AudioSegment + :param *segments: Input audio segments to be concatenated. + :type *segments: tuple of AudioSegment :return: Audio segment instance as concatenating results. :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the diff --git a/data_utils/speech.py b/data_utils/speech.py index 94ead1e8..00190009 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -69,8 +69,8 @@ class SpeechSegment(AudioSegment): def concatenate(cls, *segments): """Concatenate an arbitrary number of speech segments together. - :param *segments: Input speech segments. - :type *segments: SpeechSegment + :param *segments: Input speech segments to be concatenated. + :type *segments: tuple of SpeechSegment :return: Speech segment instance. :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the From ddb2bdc1906223733dd5b1a2ad15a54492681f5b Mon Sep 17 00:00:00 2001 From: chrisxu2016 <823254351@qq.com> Date: Mon, 19 Jun 2017 00:08:05 +0800 Subject: [PATCH 09/11] add audio file --- data_utils/audio.py | 64 ++++++++++++++++++++++++++++++-------------- data_utils/speech.py | 10 +++---- 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 37f4f0ba..5d02feb6 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -9,6 +9,7 @@ import soundfile import scikits.samplerate from scipy import signal import random +import copy class AudioSegment(object): @@ -87,9 +88,8 @@ class AudioSegment(object): :return: Audio segment instance as concatenating results. :rtype: AudioSegment :raises ValueError: If the number of segments is zero, or if the - sample_rate of any two segment does not match. - :raises TypeError: If every item in segments is not AudioSegment - instance. + sample_rate of any segments does not match. + :raises TypeError: If any segment is not AudioSegment instance. """ # Perform basic sanity-checks. if len(segments) == 0: @@ -101,7 +101,7 @@ class AudioSegment(object): "different sample rates") if type(seg) is not cls: raise TypeError("Only audio segments of the same type " - "instance can be concatenated.") + "can be concatenated.") samples = np.concatenate([seg.samples for seg in segments]) return cls(samples, sample_rate) @@ -180,8 +180,7 @@ class AudioSegment(object): @classmethod def make_silence(cls, duration, sample_rate): - """Creates a silent audio segment of the given duration and - sample rate. + """Creates a silent audio segment of the given duration and sample rate. :param duration: Length of silence in seconds. 
         :type duration: float
@@ -193,15 +192,17 @@ class AudioSegment(object):
         samples = np.zeros(int(duration * sample_rate))
         return cls(samples, sample_rate)
 
-    def superimposed(self, other):
+    def superimpose(self, other):
         """Add samples from another segment to those of this segment
         (sample-wise addition, not segment concatenation).
 
+        Note that this is an in-place transformation.
+
         :param other: Segment containing samples to be added in.
         :type other: AudioSegment
         :raise TypeError: If the types of the two segments don't match.
-        :raise ValueError: If the sample_rate of two segments not equal, or if
-                           the length of segments don't match.
+        :raise ValueError: If the sample rates of the two segments are not
+                           equal, or if the lengths of segments don't match.
         """
         if type(self) != type(other):
             raise TypeError("Cannot add segments of different types: %s "
                             "and %s." % (type(self), type(other)))
@@ -215,7 +216,7 @@ class AudioSegment(object):
     def to_bytes(self, dtype='float32'):
         """Create a byte string containing the audio content.
 
-        :param dtype: Data type for export samples. Options: 'int16','int32',
+        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                       'float32', 'float64'. Default is 'float32'.
         :type dtype: str
         :return: Byte string containing audio content.
@@ -362,16 +363,20 @@ class AudioSegment(object):
         elif sides == "both":
             padded = cls.concatenate(silence, self, silence)
         else:
-            raise ValueError("Unknown value for the kwarg %s" % sides)
+            raise ValueError("Unknown value for sides: %s" % sides)
         self._samples = padded._samples
 
     def subsegment(self, start_sec=None, end_sec=None):
-        """Return new AudioSegment containing audio between given boundaries.
+        """Cut the AudioSegment between given boundaries.
+
+        Note that this is an in-place transformation.
 
         :param start_sec: Beginning of subsegment in seconds.
         :type start_sec: float
         :param end_sec: End of subsegment in seconds.
         :type end_sec: float
+        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
+                           of bounds in time.
         """
         start_sec = 0.0 if start_sec is None else start_sec
         end_sec = self.duration if end_sec is None else end_sec
@@ -379,19 +384,33 @@ class AudioSegment(object):
             start_sec = self.duration + start_sec
         if end_sec < 0.0:
             end_sec = self.duration + end_sec
+        if start_sec < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_sec)
+        if end_sec < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end_sec)
+        if start_sec > end_sec:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_sec, end_sec))
+        if end_sec > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_sec, self.duration))
         start_sample = int(round(start_sec * self._sample_rate))
         end_sample = int(round(end_sec * self._sample_rate))
         self._samples = self._samples[start_sample:end_sample]
 
     def random_subsegment(self, subsegment_length, rng=None):
-        """Return a random subsegment of a specified length in seconds.
+        """Cut a random subsegment of the specified length.
+
+        Note that this is an in-place transformation.
 
         :param subsegment_length: Subsegment length in seconds.
         :type subsegment_length: float
         :param rng: Random number generator state.
         :type rng: random.Random
-        :raises ValueError: If the length of subsegment greater than
-                            origineal segemnt.
+        :raises ValueError: If subsegment_length is greater than the
+                            duration of the original segment.
""" rng = random.Random() if rng is None else rng if subsegment_length > self.duration: @@ -401,7 +420,7 @@ class AudioSegment(object): self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): - """Convolve this audio segment with the given impulse_segment. + """Convolve this audio segment with the given impulse segment. Note that this is an in-place transformation. @@ -428,6 +447,8 @@ class AudioSegment(object): """Convolve and normalize the resulting audio segment so that it has the same average power as the input signal. + Note that this is an in-place transformation. + :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment :param allow_resample: Indicates whether resampling is allowed when @@ -445,10 +466,12 @@ class AudioSegment(object): allow_downsampling=False, max_gain_db=300.0, rng=None): - """Adds the given noise segment at a specific signal-to-noise ratio. + """Add the given noise segment at a specific signal-to-noise ratio. If the noise segment is longer than this segment, a random subsegment of matching length is sampled from it and used instead. + Note that this is an in-place transformation. + :param noise: Noise signal to add. :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. @@ -480,9 +503,10 @@ class AudioSegment(object): " base signal (%f sec)." % (noise.duration, self.duration)) noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) - noise.random_subsegment(self.duration, rng=rng) - noise.apply_gain(noise_gain_db) - self.superimposed(noise) + noise_new = copy.deepcopy(noise) + noise_new.random_subsegment(self.duration, rng=rng) + noise_new.apply_gain(noise_gain_db) + self.superimpose(noise_new) @property def samples(self): diff --git a/data_utils/speech.py b/data_utils/speech.py index 00190009..fc031ff4 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -67,7 +67,8 @@ class SpeechSegment(AudioSegment): @classmethod def concatenate(cls, *segments): - """Concatenate an arbitrary number of speech segments together. + """Concatenate an arbitrary number of speech segments together, both + audio and transcript will be concatenated. :param *segments: Input speech segments to be concatenated. :type *segments: tuple of SpeechSegment @@ -75,8 +76,7 @@ class SpeechSegment(AudioSegment): :rtype: SpeechSegment :raises ValueError: If the number of segments is zero, or if the sample_rate of any two segments does not match. - :raises TypeError: If every item in segments is not SpeechSegment - instance. + :raises TypeError: If any segment is not SpeechSegment instance. """ if len(segments) == 0: raise ValueError("No speech segments are given to concatenate.") @@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment): return cls(samples, sample_rate, transcripts) @classmethod - def slice_from_file(cls, filepath, start=None, end=None, transcript=""): + def slice_from_file(cls, filepath, start=None, end=None, transcript): """Loads a small section of an speech without having to load the entire file into the memory which can be incredibly wasteful. @@ -121,7 +121,7 @@ class SpeechSegment(AudioSegment): @classmethod def make_silence(cls, duration, sample_rate): """Creates a silent speech segment of the given duration and - sample rate. + sample rate, transcript will be an empty string. :param duration: Length of silence in seconds. 
:type duration: float From d35b747175e36b44c369ef9ceb2b9fd7f9cbd9ec Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 19 Jun 2017 23:24:58 +0800 Subject: [PATCH 10/11] Fix ci following: 1. Unify the dependency installation process in setup.sh. 2. Change the version of package scipy from 0.13.0b1 to 0.13.1 --- requirements.txt | 3 +-- setup.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 setup.sh diff --git a/requirements.txt b/requirements.txt index c37e88ff..0183ecf0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ SoundFile==0.9.0.post1 wget==3.2 -scikits.samplerate==0.3.3 -scipy==0.13.0b1 +scipy==0.13.1 diff --git a/setup.sh b/setup.sh new file mode 100644 index 00000000..c59ef82f --- /dev/null +++ b/setup.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# install python dependencies +if [ -f 'requirements.txt' ]; then + pip install -r requirements.txt +fi + +if [ $? != 0 ]; then + exit 1 +fi + +# install scikits.samplerate +curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz" +if [ $? != 0 ]; then + echo "Download libsamplerate-0.1.9.tar.gz failed !!!" + exit 1 +fi +tar -xvf libsamplerate-0.1.9.tar.gz +cd libsamplerate-0.1.9 +./configure && make && make install +cd - +rm -rf libsamplerate-0.1.9 +rm libsamplerate-0.1.9.tar.gz +pip install scikits.samplerate==0.3.3 +if [ $? != 0 ]; then + echo "Install sckikits.samplerate failed !!!" + exit 1 +fi From a5dcd23bf2c44ac261882c89649e7c296ef936b7 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Mon, 19 Jun 2017 23:46:44 +0800 Subject: [PATCH 11/11] Follow comments. --- README.md | 2 +- setup.sh | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 23e0b412..0cdb203d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. ``` -pip install -r requirements.txt +sh setup.sh export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH ``` diff --git a/setup.sh b/setup.sh index c59ef82f..1ae2a5ee 100644 --- a/setup.sh +++ b/setup.sh @@ -4,8 +4,8 @@ if [ -f 'requirements.txt' ]; then pip install -r requirements.txt fi - if [ $? != 0 ]; then + echo "Install python dependencies failed !!!" exit 1 fi @@ -23,6 +23,8 @@ rm -rf libsamplerate-0.1.9 rm libsamplerate-0.1.9.tar.gz pip install scikits.samplerate==0.3.3 if [ $? != 0 ]; then - echo "Install sckikits.samplerate failed !!!" + echo "Install scikits.samplerate failed !!!" exit 1 fi + +echo "Install all dependencies successfully."
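
For reference, the snippet below is a minimal usage sketch of the augmentation-oriented `AudioSegment` API introduced by this patch series, written against the final state of `data_utils/audio.py` after running `sh setup.sh`. It is not part of the patches themselves; the wav and noise file names are hypothetical placeholders, and only methods shown in the diffs above are used.

```
# Hypothetical usage sketch of the AudioSegment augmentation API added above.
# The file names are placeholders, not files shipped with the repository.
from data_utils.audio import AudioSegment

# Read only the first two seconds of a (hypothetical) recording without
# loading the whole file.
audio = AudioSegment.slice_from_file("example.wav", start=0.0, end=2.0)

# Normalize to -20 dB RMS, then resample to 8 kHz.
audio.normalize(target_db=-20)
audio.resample(8000, quality='sinc_medium')

# Mix in background noise at 10 dB SNR; here the noise segment is assumed to
# share the sample rate and to be at least as long as the audio, since
# add_noise crops a random matching-length subsegment internally.
noise = AudioSegment.from_file("noise.wav")
audio.add_noise(noise, snr_dB=10.0)

# Pad 0.5 s of silence on both sides and write the result back to disk.
audio.pad_silence(0.5, sides='both')
audio.to_wav_file("augmented.wav")
```

Each of these calls after loading is an in-place transformation, so the same `audio` object is modified step by step rather than new segments being returned.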