From ddb2bdc1906223733dd5b1a2ad15a54492681f5b Mon Sep 17 00:00:00 2001
From: chrisxu2016 <823254351@qq.com>
Date: Mon, 19 Jun 2017 00:08:05 +0800
Subject: [PATCH] add audio file

---
 data_utils/audio.py  | 64 ++++++++++++++++++++++++++++++--------------
 data_utils/speech.py | 10 +++----
 2 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/data_utils/audio.py b/data_utils/audio.py
index 37f4f0ba..5d02feb6 100755
--- a/data_utils/audio.py
+++ b/data_utils/audio.py
@@ -9,6 +9,7 @@ import soundfile
 import scikits.samplerate
 from scipy import signal
 import random
+import copy
 
 
 class AudioSegment(object):
@@ -87,9 +88,8 @@ class AudioSegment(object):
         :return: Audio segment instance as concatenating results.
         :rtype: AudioSegment
         :raises ValueError: If the number of segments is zero, or if the 
-                            sample_rate of any two segment does not match.
-        :raises TypeError: If every item in segments is not AudioSegment
-                           instance.
+                            sample_rate of any segments does not match.
+        :raises TypeError: If any segment is not AudioSegment instance.
         """
         # Perform basic sanity-checks.
         if len(segments) == 0:
@@ -101,7 +101,7 @@ class AudioSegment(object):
                                  "different sample rates")
             if type(seg) is not cls:
                 raise TypeError("Only audio segments of the same type "
-                                "instance can be concatenated.")
+                                "can be concatenated.")
         samples = np.concatenate([seg.samples for seg in segments])
         return cls(samples, sample_rate)
 
@@ -180,8 +180,7 @@ class AudioSegment(object):
 
     @classmethod
     def make_silence(cls, duration, sample_rate):
-        """Creates a silent audio segment of the given duration and
-        sample rate.
+        """Creates a silent audio segment of the given duration and sample rate.
 
         :param duration: Length of silence in seconds.
         :type duration: float
@@ -193,15 +192,17 @@ class AudioSegment(object):
         samples = np.zeros(int(duration * sample_rate))
         return cls(samples, sample_rate)
 
-    def superimposed(self, other):
+    def superimpose(self, other):
         """Add samples from another segment to those of this segment
         (sample-wise addition, not segment concatenation).
 
+        Note that this is an in-place transformation.
+
         :param other: Segment containing samples to be added in.
         :type other: AudioSegments
         :raise TypeError: If type of two segments don't match.
-        :raise ValueError: If the sample_rate of two segments not equal, or if
-                           the length of segments don't match.
+        :raise ValueError: If the sample rates of the two segments are not
+                           equal, or if the lengths of segments don't match.
         """
         if type(self) != type(other):
             raise TypeError("Cannot add segments of different types: %s "
@@ -215,7 +216,7 @@ class AudioSegment(object):
     def to_bytes(self, dtype='float32'):
         """Create a byte string containing the audio content.
         
-        :param dtype: Data type for export samples. Options: 'int16','int32',
+        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                       'float32', 'float64'. Default is 'float32'.
         :type dtype: str
         :return: Byte string containing audio content.
@@ -362,16 +363,20 @@ class AudioSegment(object):
         elif sides == "both":
             padded = cls.concatenate(silence, self, silence)
         else:
-            raise ValueError("Unknown value for the kwarg %s" % sides)
+            raise ValueError("Unknown value for the sides %s" % sides)
         self._samples = padded._samples
 
     def subsegment(self, start_sec=None, end_sec=None):
-        """Return new AudioSegment containing audio between given boundaries.
+        """Cut the AudioSegment between given boundaries.
+
+        Note that this is an in-place transformation.
 
         :param start_sec: Beginning of subsegment in seconds.
         :type start_sec: float
         :param end_sec: End of subsegment in seconds.
         :type end_sec: float
+        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
+                           of bounds in time.
         """
         start_sec = 0.0 if start_sec is None else start_sec
         end_sec = self.duration if end_sec is None else end_sec
@@ -379,19 +384,33 @@ class AudioSegment(object):
             start_sec = self.duration + start_sec
         if end_sec < 0.0:
             end_sec = self.duration + end_sec
+        if start_sec < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_sec)
+        if end_sec < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end_sec)
+        if start_sec > end_sec:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_sec, end_sec))
+        if end_sec > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_sec, self.duration))
         start_sample = int(round(start_sec * self._sample_rate))
         end_sample = int(round(end_sec * self._sample_rate))
         self._samples = self._samples[start_sample:end_sample]
 
     def random_subsegment(self, subsegment_length, rng=None):
-        """Return a random subsegment of a specified length in seconds.
+        """Cut the specified length of the audiosegment randomly.
+
+        Note that this is an in-place transformation.
 
         :param subsegment_length: Subsegment length in seconds.
         :type subsegment_length: float
         :param rng: Random number generator state.
         :type rng: random.Random
-        :raises ValueError: If the length of subsegment greater than
-                            origineal segemnt.
+        :raises ValueError: If the length of subsegment is greater than
+                            the origineal segemnt.
         """
         rng = random.Random() if rng is None else rng
         if subsegment_length > self.duration:
@@ -401,7 +420,7 @@ class AudioSegment(object):
         self.subsegment(start_time, start_time + subsegment_length)
 
     def convolve(self, impulse_segment, allow_resample=False):
-        """Convolve this audio segment with the given impulse_segment.
+        """Convolve this audio segment with the given impulse segment.
 
         Note that this is an in-place transformation.
 
@@ -428,6 +447,8 @@ class AudioSegment(object):
         """Convolve and normalize the resulting audio segment so that it
         has the same average power as the input signal.
 
+        Note that this is an in-place transformation.
+
         :param impulse_segment: Impulse response segments.
         :type impulse_segment: AudioSegment
         :param allow_resample: Indicates whether resampling is allowed when
@@ -445,10 +466,12 @@ class AudioSegment(object):
                   allow_downsampling=False,
                   max_gain_db=300.0,
                   rng=None):
-        """Adds the given noise segment at a specific signal-to-noise ratio.
+        """Add the given noise segment at a specific signal-to-noise ratio.
         If the noise segment is longer than this segment, a random subsegment
         of matching length is sampled from it and used instead.
 
+        Note that this is an in-place transformation.
+
         :param noise: Noise signal to add.
         :type noise: AudioSegment
         :param snr_dB: Signal-to-Noise Ratio, in decibels.
@@ -480,9 +503,10 @@ class AudioSegment(object):
                              " base signal (%f sec)." %
                              (noise.duration, self.duration))
         noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
-        noise.random_subsegment(self.duration, rng=rng)
-        noise.apply_gain(noise_gain_db)
-        self.superimposed(noise)
+        noise_new = copy.deepcopy(noise)
+        noise_new.random_subsegment(self.duration, rng=rng)
+        noise_new.apply_gain(noise_gain_db)
+        self.superimpose(noise_new)
 
     @property
     def samples(self):
diff --git a/data_utils/speech.py b/data_utils/speech.py
index 00190009..fc031ff4 100755
--- a/data_utils/speech.py
+++ b/data_utils/speech.py
@@ -67,7 +67,8 @@ class SpeechSegment(AudioSegment):
 
     @classmethod
     def concatenate(cls, *segments):
-        """Concatenate an arbitrary number of speech segments together.
+        """Concatenate an arbitrary number of speech segments together, both
+        audio and transcript will be concatenated.
 
         :param *segments: Input speech segments to be concatenated.
         :type *segments: tuple of SpeechSegment
@@ -75,8 +76,7 @@ class SpeechSegment(AudioSegment):
         :rtype: SpeechSegment
         :raises ValueError: If the number of segments is zero, or if the 
                             sample_rate of any two segments does not match.
-        :raises TypeError: If every item in segments is not SpeechSegment
-                           instance.
+        :raises TypeError: If any segment is not SpeechSegment instance.
         """
         if len(segments) == 0:
             raise ValueError("No speech segments are given to concatenate.")
@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
         return cls(samples, sample_rate, transcripts)
 
     @classmethod
-    def slice_from_file(cls, filepath, start=None, end=None, transcript=""):
+    def slice_from_file(cls, filepath, start=None, end=None, transcript):
         """Loads a small section of an speech without having to load
         the entire file into the memory which can be incredibly wasteful.
 
@@ -121,7 +121,7 @@ class SpeechSegment(AudioSegment):
     @classmethod
     def make_silence(cls, duration, sample_rate):
         """Creates a silent speech segment of the given duration and
-        sample rate.
+        sample rate, transcript will be an empty string.
 
         :param duration: Length of silence in seconds.
         :type duration: float