diff --git a/data_utils/audio.py b/data_utils/audio.py index 066437dc..1f75da8a 100755 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -47,32 +47,6 @@ class AudioSegment(object): """Return whether two objects are unequal.""" return not self.__eq__(other) - def __len__(self): - """Returns length of segment in samples.""" - return self.num_samples - - def __add__(self, other): - """Add samples from another segment to those of this segment and return - a new segment (sample-wise addition, not segment concatenation). - - :param other: Segment containing samples to be - added in. - :type other: AudioSegment - :return: New segment containing resulting samples. - :rtype: AudioSegment - :raise TypeError: If sample rates of segments don't match, - or if length of segments don't match. - """ - if type(self) != type(other): - raise TypeError("Cannot add segment of different type: {}" - .format(type(other))) - if self._sample_rate != other._sample_rate: - raise TypeError("Sample rates must match to add segments.") - if len(self._samples) != len(other._samples): - raise TypeError("Segment lengths must match to add segments.") - samples = self.samples + other.samples - return type(self)(samples, sample_rate=self._sample_rate) - def __str__(self): """Return human-readable representation of segment.""" return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " @@ -108,13 +82,13 @@ class AudioSegment(object): def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: Input audio segments + :param *segments: Input audio segments. :type *segments: AudioSegment - :return: Audio segment instance. + :return: Audio segment instance as concatenating results. 
:rtype: AudioSegment - :raises ValueError: If number of segments is zero, or if sample_rate - not match between two audio segments - :raises TypeError: If item of segments is not Audiosegment instance + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If any segment is not an AudioSegment instance. """ # Perform basic sanity-checks. if len(segments) == 0: @@ -155,12 +129,13 @@ class AudioSegment(object): format='WAV', subtype=subtype_map[dtype]) - def slice_from_file(self, file, start=None, end=None): + @classmethod + def slice_from_file(cls, file, start=None, end=None): """Loads a small section of an audio without having to load the entire file into the memory which can be incredibly wasteful. - :param file: Input audio filepath - :type file: basestring + :param file: Input audio filepath or file object. + :type file: basestring|file :param start: Start time in seconds. If start is negative, it wraps around from the end. If not provided, this function reads from the very beginning. @@ -169,9 +144,11 @@ class AudioSegment(object): from the end. If not provided, the default behvaior is to read to the end of the file. :type end: float - :return: The specified slice of input audio in the audio.AudioSegment format. + :return: AudioSegment instance of the specified slice of the input + audio file. :rtype: AudioSegment - :rainse ValueError: If the position is error, or if the time is out bounds. + :raise ValueError: If start or end is incorrectly set, e.g. out of + bounds in time. """ sndfile = soundfile.SoundFile(file) sample_rate = sndfile.samplerate @@ -184,40 +161,60 @@ class AudioSegment(object): end += duration if start < 0.0: raise ValueError("The slice start position (%f s) is out of " - "bounds. Filename: %s" % (start, file)) + "bounds."
% start) if end < 0.0: - raise ValueError("The slice end position (%f s) is out of bounds " - "Filename: %s" % (end, file)) + raise ValueError("The slice end position (%f s) is out of bounds." % + end) if start > end: raise ValueError("The slice start position (%f s) is later than " "the slice end position (%f s)." % (start, end)) if end > duration: - raise ValueError("The slice end time (%f s) is out of bounds " - "(> %f s) Filename: %s" % (end, duration, file)) + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end, duration)) start_frame = int(start * sample_rate) end_frame = int(end * sample_rate) sndfile.seek(start_frame) data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - return type(self)(data, sample_rate) + return cls(data, sample_rate) - def make_silence(self, duration, sample_rate): + @classmethod + def make_silence(cls, duration, sample_rate): """Creates a silent audio segment of the given duration and sample rate. - :param duration: Length of silence in seconds + :param duration: Length of silence in seconds. :type duration: float - :param sample_rate: Sample rate + :param sample_rate: Sample rate. :type sample_rate: float - :return: Silence of the given duration + :return: Silent AudioSegment instance of the given duration. :rtype: AudioSegment """ samples = np.zeros(int(duration * sample_rate)) - return type(self)(samples, sample_rate) + return cls(samples, sample_rate) + + def superimposed(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + :param other: Segment containing samples to be added in. + :type other: AudioSegment + :raise TypeError: If the types of the two segments don't match. + :raise ValueError: If the sample rates of the two segments are not + equal, or if the lengths of the segments don't match. + """ + if type(self) != type(other): + raise TypeError("Cannot add segments of different types: %s " + "and %s."
% (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. - :param dtype: Data type for export samples. Options: 'int16', 'int32', + :param dtype: Data type for export samples. Options: 'int16','int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str :return: Byte string containing audio content. @@ -258,16 +255,17 @@ class AudioSegment(object): self._samples = np.interp(new_indices, old_indices, self._samples) def normalize(self, target_db=-20, max_gain_db=300.0): - """Normalize audio to be desired RMS value in decibels. + """Normalize audio to be of the desired RMS value in decibels. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels. This value should - be less than 0.0 as 0.0 is full-scale audio. + :param target_db: Target RMS value in decibels. This value should be + less than 0.0 as 0.0 is full-scale audio. :type target_db: float :param max_gain_db: Max amount of gain in dB that can be applied for - normalization. This is to prevent nans when attempting - to normalize a signal consisting of all zeros. + normalization. This is to prevent nans when + attempting to normalize a signal consisting of + all zeros. :type max_gain_db: float :raises ValueError: If the required gain to normalize the segment to the target_db value exceeds max_gain_db. 
@@ -275,9 +273,9 @@ class AudioSegment(object): gain = target_db - self.rms_db if gain > max_gain_db: raise ValueError( - "Unable to normalize segment to %f dB because it has an RMS " - "value of %f dB and the difference exceeds max_gain_db (%f dB)" - % (target_db, self.rms_db, max_gain_db)) + "Unable to normalize segment to %f dB because the " + "probable gain exceeds max_gain_db (%f dB)" % + (target_db, max_gain_db)) self.apply_gain(min(max_gain_db, target_db - self.rms_db)) def normalize_online_bayesian(self, @@ -285,30 +283,30 @@ class AudioSegment(object): prior_db, prior_samples, startup_delay=0.0): - """Normalize audio using a production-compatible online/causal algorithm. - This uses an exponential likelihood and gamma prior to make online estimates - of the RMS even when there are very few samples. + """Normalize audio using a production-compatible online/causal + algorithm. This uses an exponential likelihood and gamma prior to + make online estimates of the RMS even when there are very few samples. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels + :param target_db: Target RMS value in decibels. :type target_bd: float - :param prior_db: Prior RMS estimate in decibels + :param prior_db: Prior RMS estimate in decibels. :type prior_db: float - :param prior_samples: Prior strength in number of samples + :param prior_samples: Prior strength in number of samples. :type prior_samples: float - :param startup_delay: Default 0.0 s. If provided, this function will accrue - statistics for the first startup_delay seconds before - applying online normalization. + :param startup_delay: Default 0.0 s. If provided, this function will + accrue statistics for the first startup_delay + seconds before applying online normalization. :type startup_delay: float """ - # Estimate total RMS online + # Estimate total RMS online.
startup_sample_idx = min(self.num_samples - 1, int(self.sample_rate * startup_delay)) prior_mean_squared = 10.**(prior_db / 10.) prior_sum_of_squares = prior_mean_squared * prior_samples cumsum_of_squares = np.cumsum(self.samples**2) - sample_count = np.arange(len(self)) + 1 + sample_count = np.arange(self.num_samples) + 1 if startup_sample_idx > 0: cumsum_of_squares[:startup_sample_idx] = \ cumsum_of_squares[startup_sample_idx] @@ -317,42 +315,40 @@ class AudioSegment(object): mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / (sample_count + prior_samples)) rms_estimate_db = 10 * np.log10(mean_squared_estimate) - # Compute required time-varying gain + # Compute required time-varying gain. gain_db = target_db - rms_estimate_db self.apply_gain(gain_db) def resample(self, target_sample_rate, quality='sinc_medium'): - """Resample audio segment. This resamples the audio to a new - sample rate. + """Resample the audio to a target sample rate. Note that this is an in-place transformation. - :param target_sample_rate: Target sample rate + :param target_sample_rate: Target sample rate. :type target_sample_rate: int :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. Sets resampling speed/quality tradeoff. See http://www.mega-nerd.com/SRC/api_misc.html#Converters - :type quality: basestring + :type quality: str """ resample_ratio = target_sample_rate / self._sample_rate - new_samples = scikits.samplerate.resample( + self._samples = scikits.samplerate.resample( self._samples, r=resample_ratio, type=quality) - self._samples = new_samples self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): - """Pads this audio sample with a period of silence. + """Pad this audio sample with a period of silence. Note that this is an in-place transformation. - :param duration: Length of silence in seconds to pad + :param duration: Length of silence in seconds to pad.
:type duration: float - :param sides: Position for padding - 'beginning' - adds silence in the beginning - 'end' - adds silence in the end + :param sides: Position for padding: + 'beginning' - adds silence in the beginning; + 'end' - adds silence in the end; 'both' - adds silence in both the beginning and the end. :type sides: str - :raises ValueError: If the sides not surport + :raises ValueError: If sides is not supported. """ if duration == 0.0: return self @@ -367,51 +363,41 @@ class AudioSegment(object): else: raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples - self._sample_rate = padded._sample_rate def subsegment(self, start_sec=None, end_sec=None): """Return new AudioSegment containing audio between given boundaries. - :param start_sec: Beginning of subsegment in seconds, - (beginning of segment if None). + :param start_sec: Beginning of subsegment in seconds. :type start_sec: float - :param end_sec: End of subsegment in seconds, - (end of segment if None). + :param end_sec: End of subsegment in seconds. :type end_sec: float - :return: New AudioSegment containing specified subsegment. - :rtype: AudioSegment """ start_sec = 0.0 if start_sec is None else start_sec end_sec = self.duration if end_sec is None else end_sec - # negative boundaries are relative to end of segment if start_sec < 0.0: start_sec = self.duration + start_sec if end_sec < 0.0: end_sec = self.duration + end_sec start_sample = int(round(start_sec * self._sample_rate)) end_sample = int(round(end_sec * self._sample_rate)) - samples = self._samples[start_sample:end_sample] - return type(self)(samples, sample_rate=self._sample_rate) + self._samples = self._samples[start_sample:end_sample] def random_subsegment(self, subsegment_length, rng=None): """Return a random subsegment of a specified length in seconds. :param subsegment_length: Subsegment length in seconds. 
:type subsegment_length: float - :param rng: Random number generator state + :param rng: Random number generator state. :type rng: random.Random - :return: New AudioSegment containing random subsegment - of original segment - :rtype: AudioSegment - :raises ValueError: If the length of subsegment greater than origineal - segemnt. + :raises ValueError: If the length of subsegment is greater than + the original segment. """ rng = random.Random() if rng is None else rng if subsegment_length > self.duration: raise ValueError("Length of subsegment must not be greater " "than original segment.") start_time = rng.uniform(0.0, self.duration - subsegment_length) - return self.subsegment(start_time, start_time + subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) def convolve(self, impulse_segment, allow_resample=False): """Convolve this audio segment with the given filter. @@ -420,10 +406,10 @@ class AudioSegment(object): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment - :param allow_resample: indicates whether resampling is allowed when - the impulse_segment has a different sample - rate from this signal. - :type allow_resample: boolean + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool :raises ValueError: If the sample rate is not match between two audio segments and resample is not allowed. """ @@ -443,9 +429,10 @@ class AudioSegment(object): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment - :param allow_resample: indicates whether resampling is allowed when - the impulse_segment has a different sample rate from this signal. - :type allow_resample: boolean + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal.
+ :type allow_resample: bool """ target_db = self.rms_db self.convolve(impulse_segment, allow_resample=allow_resample) @@ -465,42 +452,36 @@ class AudioSegment(object): :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. :type snr_dB: float - :param allow_downsampling: whether to allow the noise signal to be downsampled - to match the base signal sample rate. - :type allow_downsampling: boolean - :param max_gain_db: Maximum amount of gain to apply to noise signal before - adding it in. This is to prevent attempting to apply infinite - gain to a zero signal. + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. :type max_gain_db: float :param rng: Random number generator state. - :type rng: random.Random - :raises ValueError: If the sample rate does not match between the two audio segments - and resample is not allowed, or if the duration of noise segments - is shorter than original audio segments. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments and resample is not allowed, or if + the duration of noise segments is shorter than + original audio segments. """ rng = random.Random() if rng is None else rng if allow_downsampling and noise.sample_rate > self.sample_rate: noise = noise.resample(self.sample_rate) if noise.sample_rate != self.sample_rate: - raise ValueError("Noise sample rate (%d Hz) is not equal to " - "base signal sample rate (%d Hz)." % - (noise.sample_rate, self.sample_rate)) + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." 
% (noise.sample_rate, + self.sample_rate)) if noise.duration < self.duration: - raise ValueError("Noise signal (%f sec) must be at " - "least as long as base signal (%f sec)." % + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." % (noise.duration, self.duration)) - noise_gain_db = self.rms_db - noise.rms_db - snr_dB - noise_gain_db = min(max_gain_db, noise_gain_db) - noise_subsegment = noise.random_subsegment(self.duration, rng=rng) - output = self + self.tranform_noise(noise_subsegment, noise_gain_db) - self._samples = output._samples - self._sample_rate = output._sample_rate - - def tranform_noise(self, noise_subsegment, noise_gain_db): - """ tranform noise file - """ - return type(self)(noise_subsegment._samples * (10.**( - noise_gain_db / 20.)), noise_subsegment._sample_rate) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise.random_subsegment(self.duration, rng=rng) + noise.apply_gain(noise_gain_db) + self.superimposed(noise) @property def samples(self): @@ -571,7 +552,7 @@ class AudioSegment(object): Audio sample type is usually integer or float-point. For integer type, float32 will be rescaled from [-1, 1] to the maximum range supported by the integer type. - + This is for writing a audio file. """ dtype = np.dtype(dtype) diff --git a/data_utils/speech.py b/data_utils/speech.py index 5d1fc15a..443df68c 100755 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -67,20 +67,20 @@ class SpeechSegment(AudioSegment): @classmethod def concatenate(cls, *segments): - """Concatenate an arbitrary number of audio segments together. + """Concatenate an arbitrary number of speech segments together. - :param *segments: Input speech segments + :param *segments: Input speech segments. :type *segments: SpeechSegment :return: Speech segment instance. 
:rtype: SpeechSegment - :raises ValueError: If number of segments is zero, or if sample_rate - not match between two audio segments - :raises TypeError: If item of segments is not Audiosegment instance + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If any segment is not a SpeechSegment instance. """ - # Perform basic sanity-checks. if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") sample_rate = segments[0]._sample_rate + transcripts = "" for seg in segments: if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " @@ -88,8 +88,49 @@ class SpeechSegment(AudioSegment): if type(seg) is not cls: raise TypeError("Only speech segments of the same type " "instance can be concatenated.") + transcripts += seg._transcript samples = np.concatenate([seg.samples for seg in segments]) - return cls(samples, sample_rate, seg._transcript) + return cls(samples, sample_rate, transcripts) + + @classmethod + def slice_from_file(cls, filepath, start=None, end=None, transcript=""): + """Loads a small section of a speech without having to load + the entire file into the memory which can be incredibly wasteful. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behavior is + to read to the end of the file. + :type end: float + :param transcript: Transcript text for the speech. If not provided, + the default is an empty string. + :type transcript: basestring + :return: SpeechSegment instance of the specified slice of the input + speech file.
+ :rtype: SpeechSegment + """ + audio = AudioSegment.slice_from_file(filepath, start, end) + return cls(audio.samples, audio.sample_rate, transcript) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent speech segment of the given duration and + sample rate. + + :param duration: Length of silence in seconds. + :type duration: float + :param sample_rate: Sample rate. + :type sample_rate: float + :return: Silence of the given duration. + :rtype: SpeechSegment + """ + audio = AudioSegment.make_silence(duration, sample_rate) + return cls(audio.samples, audio.sample_rate, "") @property def transcript(self):