From 5ca270d30a34c71b0b851ed376fb7e7d90b3cf17 Mon Sep 17 00:00:00 2001
From: chrisxu2016 <823254351@qq.com>
Date: Sat, 17 Jun 2017 09:03:18 +0800
Subject: [PATCH] add audio file

---
 data_utils/audio.py  | 245 ++++++++++++++++++++-----------------------
 data_utils/speech.py |  55 ++++++++--
 2 files changed, 161 insertions(+), 139 deletions(-)

diff --git a/data_utils/audio.py b/data_utils/audio.py
index 066437dc..1f75da8a 100755
--- a/data_utils/audio.py
+++ b/data_utils/audio.py
@@ -47,32 +47,6 @@ class AudioSegment(object):
         """Return whether two objects are unequal."""
         return not self.__eq__(other)
 
-    def __len__(self):
-        """Returns length of segment in samples."""
-        return self.num_samples
-
-    def __add__(self, other):
-        """Add samples from another segment to those of this segment and return
-        a new segment (sample-wise addition, not segment concatenation).
-
-        :param other: Segment containing samples to be
-                      added in.
-        :type other: AudioSegment
-        :return: New segment containing resulting samples.
-        :rtype: AudioSegment
-        :raise TypeError: If sample rates of segments don't match,
-                          or if length of segments don't match.
-        """
-        if type(self) != type(other):
-            raise TypeError("Cannot add segment of different type: {}"
-                            .format(type(other)))
-        if self._sample_rate != other._sample_rate:
-            raise TypeError("Sample rates must match to add segments.")
-        if len(self._samples) != len(other._samples):
-            raise TypeError("Segment lengths must match to add segments.")
-        samples = self.samples + other.samples
-        return type(self)(samples, sample_rate=self._sample_rate)
-
     def __str__(self):
         """Return human-readable representation of segment."""
         return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
@@ -108,13 +82,13 @@ class AudioSegment(object):
     def concatenate(cls, *segments):
         """Concatenate an arbitrary number of audio segments together.
 
-        :param *segments: Input audio segments
+        :param *segments: Input audio segments.
         :type *segments: AudioSegment
-        :return: Audio segment instance.
+        :return: Audio segment instance as concatenating results.
         :rtype: AudioSegment
-        :raises ValueError: If number of segments is zero, or if sample_rate
-                            not match between two audio segments
-        :raises TypeError: If item of segments is not Audiosegment instance
+        :raises ValueError: If the number of segments is zero, or if the 
+                            sample_rate of any two segments does not match.
+        :raises TypeError: If every segment in is not Audiosegment instance.
         """
         # Perform basic sanity-checks.
         if len(segments) == 0:
@@ -155,12 +129,13 @@ class AudioSegment(object):
             format='WAV',
             subtype=subtype_map[dtype])
 
-    def slice_from_file(self, file, start=None, end=None):
+    @classmethod
+    def slice_from_file(cls, file, start=None, end=None):
         """Loads a small section of an audio without having to load
         the entire file into the memory which can be incredibly wasteful.
 
-        :param file: Input audio filepath
-        :type file: basestring
+        :param file: Input audio filepath or file object.
+        :type file: basestring|file
         :param start: Start time in seconds. If start is negative, it wraps
                       around from the end. If not provided, this function 
                       reads from the very beginning.
@@ -169,9 +144,11 @@ class AudioSegment(object):
                     from the end. If not provided, the default behvaior is
                     to read to the end of the file.
         :type end: float
-        :return: The specified slice of input audio in the audio.AudioSegment format.
+        :return: AudioSegment instance of the specified slice of the input
+                 audio file.
         :rtype: AudioSegment
-        :rainse ValueError: If the position is error, or if the time is out bounds.
+        :raise ValueError: If start or end is incorrectly set, e.g. out of
+                           bounds in time.
         """
         sndfile = soundfile.SoundFile(file)
         sample_rate = sndfile.samplerate
@@ -184,40 +161,60 @@ class AudioSegment(object):
             end += duration
         if start < 0.0:
             raise ValueError("The slice start position (%f s) is out of "
-                             "bounds. Filename: %s" % (start, file))
+                             "bounds." % start)
         if end < 0.0:
-            raise ValueError("The slice end position (%f s) is out of bounds "
-                             "Filename: %s" % (end, file))
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end)
         if start > end:
             raise ValueError("The slice start position (%f s) is later than "
                              "the slice end position (%f s)." % (start, end))
         if end > duration:
-            raise ValueError("The slice end time (%f s) is out of bounds "
-                             "(> %f s) Filename: %s" % (end, duration, file))
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end, duration))
         start_frame = int(start * sample_rate)
         end_frame = int(end * sample_rate)
         sndfile.seek(start_frame)
         data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
-        return type(self)(data, sample_rate)
+        return cls(data, sample_rate)
 
-    def make_silence(self, duration, sample_rate):
+    @classmethod
+    def make_silence(cls, duration, sample_rate):
         """Creates a silent audio segment of the given duration and
         sample rate.
 
-        :param duration: Length of silence in seconds
+        :param duration: Length of silence in seconds.
         :type duration: float
-        :param sample_rate: Sample rate
+        :param sample_rate: Sample rate.
         :type sample_rate: float
-        :return: Silence of the given duration
+        :return: Silent AudioSegment instance of the given duration.
         :rtype: AudioSegment
         """
         samples = np.zeros(int(duration * sample_rate))
-        return type(self)(samples, sample_rate)
+        return cls(samples, sample_rate)
+
+    def superimposed(self, other):
+        """Add samples from another segment to those of this segment in place
+        (sample-wise addition, not segment concatenation).
+
+        :param other: Segment containing samples to be added in.
+        :type other: AudioSegment
+        :raise TypeError: If the types of the two segments don't match.
+        :raise ValueError: If the sample rates or the lengths of the two
+                           segments don't match.
+        """
+        if type(self) != type(other):
+            raise TypeError("Cannot add segments of different types: %s "
+                            "and %s." % (type(self), type(other)))
+        if self._sample_rate != other._sample_rate:
+            raise ValueError("Sample rates must match to add segments.")
+        if len(self._samples) != len(other._samples):
+            raise ValueError("Segment lengths must match to add segments.")
+        self._samples += other._samples
 
     def to_bytes(self, dtype='float32'):
         """Create a byte string containing the audio content.
         
-        :param dtype: Data type for export samples. Options: 'int16', 'int32',
+        :param dtype: Data type for export samples. Options: 'int16','int32',
                       'float32', 'float64'. Default is 'float32'.
         :type dtype: str
         :return: Byte string containing audio content.
@@ -258,16 +255,17 @@ class AudioSegment(object):
         self._samples = np.interp(new_indices, old_indices, self._samples)
 
     def normalize(self, target_db=-20, max_gain_db=300.0):
-        """Normalize audio to be desired RMS value in decibels.
+        """Normalize audio to be of the desired RMS value in decibels.
 
         Note that this is an in-place transformation.
 
-        :param target_db: Target RMS value in decibels. This value should
-                          be less than 0.0 as 0.0 is full-scale audio.
+        :param target_db: Target RMS value in decibels. This value should be
+                          less than 0.0 as 0.0 is full-scale audio.
         :type target_db: float
         :param max_gain_db: Max amount of gain in dB that can be applied for
-                            normalization. This is to prevent nans when attempting
-                            to normalize a signal consisting of all zeros.
+                            normalization. This is to prevent nans when
+                            attempting to normalize a signal consisting of
+                            all zeros.
         :type max_gain_db: float
         :raises ValueError: If the required gain to normalize the segment to
                             the target_db value exceeds max_gain_db.
@@ -275,9 +273,9 @@ class AudioSegment(object):
         gain = target_db - self.rms_db
         if gain > max_gain_db:
             raise ValueError(
-                "Unable to normalize segment to %f dB because it has an RMS "
-                "value of %f dB and the difference exceeds max_gain_db (%f dB)"
-                % (target_db, self.rms_db, max_gain_db))
+                "Unable to normalize segment to %f dB because the "
+                "required gain exceeds max_gain_db (%f dB)." %
+                (target_db, max_gain_db))
         self.apply_gain(min(max_gain_db, target_db - self.rms_db))
 
     def normalize_online_bayesian(self,
@@ -285,30 +283,30 @@ class AudioSegment(object):
                                   prior_db,
                                   prior_samples,
                                   startup_delay=0.0):
-        """Normalize audio using a production-compatible online/causal algorithm.
-        This uses an exponential likelihood and gamma prior to make online estimates
-        of the RMS even when there are very few samples.
+        """Normalize audio using a production-compatible online/causal
+        algorithm. This uses an exponential likelihood and gamma prior to
+        make online estimates of the RMS even when there are very few samples.
 
         Note that this is an in-place transformation.
 
-        :param target_db: Target RMS value in decibels
+        :param target_db: Target RMS value in decibels.
         :type target_bd: float
-        :param prior_db: Prior RMS estimate in decibels
+        :param prior_db: Prior RMS estimate in decibels.
         :type prior_db: float
-        :param prior_samples: Prior strength in number of samples
+        :param prior_samples: Prior strength in number of samples.
         :type prior_samples: float
-        :param startup_delay: Default 0.0 s. If provided, this function will accrue
-                              statistics for the first startup_delay seconds before
-                              applying online normalization.
+        :param startup_delay: Default 0.0 s. If provided, this function will
+                              accrue statistics for the first startup_delay 
+                              seconds before applying online normalization.
         :type startup_delay: float
         """
-        # Estimate total RMS online
+        # Estimate total RMS online.
         startup_sample_idx = min(self.num_samples - 1,
                                  int(self.sample_rate * startup_delay))
         prior_mean_squared = 10.**(prior_db / 10.)
         prior_sum_of_squares = prior_mean_squared * prior_samples
         cumsum_of_squares = np.cumsum(self.samples**2)
-        sample_count = np.arange(len(self)) + 1
+        sample_count = np.arange(self.num_samples) + 1
         if startup_sample_idx > 0:
             cumsum_of_squares[:startup_sample_idx] = \
                 cumsum_of_squares[startup_sample_idx]
@@ -317,42 +315,40 @@ class AudioSegment(object):
         mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                  (sample_count + prior_samples))
         rms_estimate_db = 10 * np.log10(mean_squared_estimate)
-        # Compute required time-varying gain
+        # Compute required time-varying gain.
         gain_db = target_db - rms_estimate_db
         self.apply_gain(gain_db)
 
     def resample(self, target_sample_rate, quality='sinc_medium'):
-        """Resample audio segment. This resamples the audio to a new 
-        sample rate.
+        """Resample the audio to a target sample rate.
 
         Note that this is an in-place transformation.
 
-        :param target_sample_rate: Target sample rate
+        :param target_sample_rate: Target sample rate.
         :type target_sample_rate: int
         :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
                         Sets resampling speed/quality tradeoff.
                         See http://www.mega-nerd.com/SRC/api_misc.html#Converters
-        :type quality: basestring
+        :type quality: str
         """
         resample_ratio = target_sample_rate / self._sample_rate
-        new_samples = scikits.samplerate.resample(
+        self._samples = scikits.samplerate.resample(
             self._samples, r=resample_ratio, type=quality)
-        self._samples = new_samples
         self._sample_rate = target_sample_rate
 
     def pad_silence(self, duration, sides='both'):
-        """Pads this audio sample with a period of silence.
+        """Pad this audio sample with a period of silence.
 
         Note that this is an in-place transformation.
 
-        :param duration: Length of silence in seconds to pad
+        :param duration: Length of silence in seconds to pad.
         :type duration: float
-        :param sides: Position for padding
-                     'beginning' - adds silence in the beginning
-                     'end' - adds silence in the end
+        :param sides: Position for padding:
+                     'beginning' - adds silence in the beginning;
+                     'end' - adds silence in the end;
                      'both' - adds silence in both the beginning and the end.
         :type sides: str
-        :raises ValueError: If the sides not surport
+        :raises ValueError: If sides is not supported.
         """
         if duration == 0.0:
             return self
@@ -367,51 +363,41 @@ class AudioSegment(object):
         else:
             raise ValueError("Unknown value for the kwarg %s" % sides)
         self._samples = padded._samples
-        self._sample_rate = padded._sample_rate
 
     def subsegment(self, start_sec=None, end_sec=None):
         """Return new AudioSegment containing audio between given boundaries.
 
-        :param start_sec: Beginning of subsegment in seconds,
-                          (beginning of segment if None).
+        :param start_sec: Beginning of subsegment in seconds.
         :type start_sec: float
-        :param end_sec: End of subsegment in seconds,
-                        (end of segment if None).
+        :param end_sec: End of subsegment in seconds.
         :type end_sec: float
-        :return: New AudioSegment containing specified subsegment.
-        :rtype: AudioSegment
         """
         start_sec = 0.0 if start_sec is None else start_sec
         end_sec = self.duration if end_sec is None else end_sec
-        # negative boundaries are relative to end of segment
         if start_sec < 0.0:
             start_sec = self.duration + start_sec
         if end_sec < 0.0:
             end_sec = self.duration + end_sec
         start_sample = int(round(start_sec * self._sample_rate))
         end_sample = int(round(end_sec * self._sample_rate))
-        samples = self._samples[start_sample:end_sample]
-        return type(self)(samples, sample_rate=self._sample_rate)
+        self._samples = self._samples[start_sample:end_sample]
 
     def random_subsegment(self, subsegment_length, rng=None):
         """Return a random subsegment of a specified length in seconds.
 
         :param subsegment_length: Subsegment length in seconds.
         :type subsegment_length: float
-        :param rng: Random number generator state
+        :param rng: Random number generator state.
         :type rng: random.Random
-        :return: New AudioSegment containing random subsegment
-                 of original segment
-        :rtype: AudioSegment
-        :raises ValueError: If the length of subsegment greater than origineal
-                            segemnt.
+        :raises ValueError: If the length of subsegment greater than
+                            origineal segemnt.
         """
         rng = random.Random() if rng is None else rng
         if subsegment_length > self.duration:
             raise ValueError("Length of subsegment must not be greater "
                              "than original segment.")
         start_time = rng.uniform(0.0, self.duration - subsegment_length)
-        return self.subsegment(start_time, start_time + subsegment_length)
+        self.subsegment(start_time, start_time + subsegment_length)
 
     def convolve(self, impulse_segment, allow_resample=False):
         """Convolve this audio segment with the given filter.
@@ -420,10 +406,10 @@ class AudioSegment(object):
 
         :param impulse_segment: Impulse response segments.
         :type impulse_segment: AudioSegment
-        :param allow_resample: indicates whether resampling is allowed when
-                                 the impulse_segment has a different sample 
-                                 rate from this signal.
-        :type allow_resample: boolean
+        :param allow_resample: Indicates whether resampling is allowed when
+                               the impulse_segment has a different sample 
+                               rate from this signal.
+        :type allow_resample: bool
         :raises ValueError: If the sample rate is not match between two
                             audio segments and resample is not allowed.
         """
@@ -443,9 +429,10 @@ class AudioSegment(object):
 
         :param impulse_segment: Impulse response segments.
         :type impulse_segment: AudioSegment
-        :param allow_resample: indicates whether resampling is allowed when
-                               the impulse_segment has a different sample rate from this signal.
-        :type allow_resample: boolean
+        :param allow_resample: Indicates whether resampling is allowed when
+                               the impulse_segment has a different sample
+                               rate from this signal.
+        :type allow_resample: bool
         """
         target_db = self.rms_db
         self.convolve(impulse_segment, allow_resample=allow_resample)
@@ -465,42 +452,36 @@ class AudioSegment(object):
         :type noise: AudioSegment
         :param snr_dB: Signal-to-Noise Ratio, in decibels.
         :type snr_dB: float
-        :param allow_downsampling: whether to allow the noise signal to be downsampled
-                                   to match the base signal sample rate.
-        :type allow_downsampling: boolean
-        :param max_gain_db: Maximum amount of gain to apply to noise signal before
-                            adding it in. This is to prevent attempting to apply infinite
-                            gain to a zero signal.
+        :param allow_downsampling: Whether to allow the noise signal to be
+                                   downsampled to match the base signal sample
+                                   rate.
+        :type allow_downsampling: bool
+        :param max_gain_db: Maximum amount of gain to apply to noise signal
+                            before adding it in. This is to prevent attempting
+                            to apply infinite gain to a zero signal.
         :type max_gain_db: float
         :param rng: Random number generator state.
-        :type rng: random.Random
-        :raises ValueError: If the sample rate does not match between the two audio segments
-                            and resample is not allowed, or if the duration of noise segments
-                            is shorter than original audio segments.
+        :type rng: None|random.Random
+        :raises ValueError: If the sample rate does not match between the two
+                            audio segments and resample is not allowed, or if
+                            the duration of noise segments is shorter than
+                            original audio segments.
         """
         rng = random.Random() if rng is None else rng
         if allow_downsampling and noise.sample_rate > self.sample_rate:
             noise = noise.resample(self.sample_rate)
         if noise.sample_rate != self.sample_rate:
-            raise ValueError("Noise sample rate (%d Hz) is not equal to "
-                             "base signal sample rate (%d Hz)." %
-                             (noise.sample_rate, self.sample_rate))
+            raise ValueError("Noise sample rate (%d Hz) is not equal to base "
+                             "signal sample rate (%d Hz)." % (noise.sample_rate,
+                                                              self.sample_rate))
         if noise.duration < self.duration:
-            raise ValueError("Noise signal (%f sec) must be at "
-                             "least as long as base signal (%f sec)." %
+            raise ValueError("Noise signal (%f sec) must be at least as long as"
+                             " base signal (%f sec)." %
                              (noise.duration, self.duration))
-        noise_gain_db = self.rms_db - noise.rms_db - snr_dB
-        noise_gain_db = min(max_gain_db, noise_gain_db)
-        noise_subsegment = noise.random_subsegment(self.duration, rng=rng)
-        output = self + self.tranform_noise(noise_subsegment, noise_gain_db)
-        self._samples = output._samples
-        self._sample_rate = output._sample_rate
-
-    def tranform_noise(self, noise_subsegment, noise_gain_db):
-        """ tranform noise file
-        """
-        return type(self)(noise_subsegment._samples * (10.**(
-            noise_gain_db / 20.)), noise_subsegment._sample_rate)
+        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
+        noise.random_subsegment(self.duration, rng=rng)
+        noise.apply_gain(noise_gain_db)
+        self.superimposed(noise)
 
     @property
     def samples(self):
@@ -571,7 +552,7 @@ class AudioSegment(object):
         Audio sample type is usually integer or float-point. For integer
         type, float32 will be rescaled from [-1, 1] to the maximum range
         supported by the integer type.
-        
+
         This is for writing a audio file.
         """
         dtype = np.dtype(dtype)
diff --git a/data_utils/speech.py b/data_utils/speech.py
index 5d1fc15a..443df68c 100755
--- a/data_utils/speech.py
+++ b/data_utils/speech.py
@@ -67,20 +67,20 @@ class SpeechSegment(AudioSegment):
 
     @classmethod
     def concatenate(cls, *segments):
-        """Concatenate an arbitrary number of audio segments together.
+        """Concatenate an arbitrary number of speech segments together.
 
-        :param *segments: Input speech segments
+        :param *segments: Input speech segments.
         :type *segments: SpeechSegment
         :return: Speech segment instance.
         :rtype: SpeechSegment
-        :raises ValueError: If number of segments is zero, or if sample_rate
-                            not match between two audio segments
-        :raises TypeError: If item of segments is not Audiosegment instance
+        :raises ValueError: If the number of segments is zero, or if the 
+                            sample_rate of any two segments does not match.
+        :raises TypeError: If every segment in is not Audiosegment instance.
         """
-        # Perform basic sanity-checks.
         if len(segments) == 0:
             raise ValueError("No audio segments are given to concatenate.")
         sample_rate = segments[0]._sample_rate
+        transcripts = ""
         for seg in segments:
             if sample_rate != seg._sample_rate:
                 raise ValueError("Can't concatenate segments with "
@@ -88,8 +88,49 @@ class SpeechSegment(AudioSegment):
             if type(seg) is not cls:
                 raise TypeError("Only speech segments of the same type "
                                 "instance can be concatenated.")
+            transcripts += seg._transcript
         samples = np.concatenate([seg.samples for seg in segments])
-        return cls(samples, sample_rate, seg._transcript)
+        return cls(samples, sample_rate, transcripts)
+
+    @classmethod
+    def slice_from_file(cls, filepath, start=None, end=None, transcript=""):
+        """Load a small section of a speech file without having to load
+        the entire file into memory, which can be incredibly wasteful.
+
+        :param filepath: Filepath or file object to audio file.
+        :type filepath: basestring|file
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :param transcript: Transcript text for the speech. If not provided,
+                           it defaults to an empty string.
+        :type transcript: basestring
+        :return: SpeechSegment instance of the specified slice of the input
+                 speech file.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.slice_from_file(filepath, start, end)
+        return cls(audio.samples, audio.sample_rate, transcript)
+
+    @classmethod
+    def make_silence(cls, duration, sample_rate):
+        """Create a silent speech segment of the given duration and
+        sample rate, with an empty transcript.
+
+        :param duration: Length of silence in seconds.
+        :type duration: float
+        :param sample_rate: Sample rate.
+        :type sample_rate: float
+        :return: Silent SpeechSegment instance of the given duration.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.make_silence(duration, sample_rate)
+        return cls(audio.samples, audio.sample_rate, "")
 
     @property
     def transcript(self):