Merge branch 'develop' into ctc

4 years ago · 764a5d4271
parent b1c80c45e0 1ac9e781d4
commit 764a5d4271
3 changed files with 47 additions and 23 deletions
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@ -21,7 +21,6 @@ import struct
 import numpy as np
 import resampy
 import soundfile
-import soxbindings as sox
 from scipy import signal

 from .utility import convert_samples_from_float32
@ -98,7 +97,7 @@ class AudioSegment():
        :param file: Input audio filepath or file object.
        :type file: str|file
        :param start: Start time in seconds. If start is negative, it wraps
-                      around from the end. If not provided, this function 
+                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
@ -199,7 +198,7 @@ class AudioSegment():
    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.
-        
+
        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :return: Audio segment instance.
@ -217,7 +216,7 @@ class AudioSegment():
        :type *segments: tuple of AudioSegment
        :return: Audio segment instance as concatenating results.
        :rtype: AudioSegment
-        :raises ValueError: If the number of segments is zero, or if the 
+        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any segments does not match.
        :raises TypeError: If any segment is not AudioSegment instance.
        """
@ -251,7 +250,7 @@ class AudioSegment():

    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.
-        
+
        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: str|file
@ -297,7 +296,7 @@ class AudioSegment():

    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.
-        
+
        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
@ -309,7 +308,7 @@ class AudioSegment():

    def to(self, dtype='int16'):
        """Create a `dtype` audio content.
-        
+
        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'int16'.
        :type dtype: str
@ -323,8 +322,8 @@ class AudioSegment():
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.
-        
-        :param gain: Gain in decibels to apply to samples. 
+
+        :param gain: Gain in decibels to apply to samples.
        :type gain: float|1darray
        """
        self._samples *= 10.**(gain / 20.)
@ -333,7 +332,7 @@ class AudioSegment():
        """Change the audio speed by linear interpolation.

        Note that this is an in-place transformation.
-        
+
        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
@ -355,6 +354,19 @@ class AudioSegment():
        # self._samples = np.interp(new_indices, old_indices, self._samples)

        # sox, slow
+        try:
+            import soxbindings as sox
+        except:
+            try:
+                from paddlespeech.s2t.utils import dynamic_pip_install
+                package = "sox"
+                dynamic_pip_install.install(package)
+                package = "soxbindings"
+                dynamic_pip_install.install(package)
+                import soxbindings as sox
+            except:
+                raise RuntimeError("Can not install soxbindings on your system." )
+
        tfm = sox.Transformer()
        tfm.set_globals(multithread=False)
        tfm.speed(speed_rate)
@ -405,7 +417,7 @@ class AudioSegment():
        :param prior_samples: Prior strength in number of samples.
        :type prior_samples: float
        :param startup_delay: Default 0.0s. If provided, this function will
-                              accrue statistics for the first startup_delay 
+                              accrue statistics for the first startup_delay
                              seconds before applying online normalization.
        :type startup_delay: float
        """
@ -557,7 +569,7 @@ class AudioSegment():
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
-                               the impulse_segment has a different sample 
+                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        :raises ValueError: If the sample rate does not match between two
@ -695,7 +707,7 @@ class AudioSegment():

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.
-        
+
        Audio sample type is usually integer or float-point. For integer
        type, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@ -16,7 +16,6 @@ import librosa
 import numpy
 import scipy
 import soundfile
-import soxbindings as sox

 from paddlespeech.s2t.io.reader import SoundHDF5File

@ -115,10 +114,10 @@ class SpeedPerturbationSox():
    and sox-speed just to resample the input,
    i.e pitch and tempo are changed both.

-    To speed up or slow down the sound of a file, 
-    use speed to modify the pitch and the duration of the file. 
-    This raises the speed and reduces the time. 
-    The default factor is 1.0 which makes no change to the audio. 
+    To speed up or slow down the sound of a file,
+    use speed to modify the pitch and the duration of the file.
+    This raises the speed and reduces the time.
+    The default factor is 1.0 which makes no change to the audio.
    2.0 doubles speed, thus time length is cut by a half and pitch is one octave higher.

    "Why use speed option instead of tempo -s in SoX for speed perturbation"
@ -130,7 +129,7 @@ class SpeedPerturbationSox():
    speed option:
    sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9

-    If we use speed option like above, the pitch of audio also will be changed, 
+    If we use speed option like above, the pitch of audio also will be changed,
    but the tempo option does not change the pitch.
    """

@ -146,6 +145,19 @@ class SpeedPerturbationSox():
        self.keep_length = keep_length
        self.state = numpy.random.RandomState(seed)

+        try:
+            import soxbindings as sox
+        except:
+            try:
+                from paddlespeech.s2t.utils import dynamic_pip_install
+                package = "sox"
+                dynamic_pip_install.install(package)
+                package = "soxbindings"
+                dynamic_pip_install.install(package)
+                import soxbindings as sox
+            except:
+                raise RuntimeError("Can not install soxbindings on your system." )
+
        if utt2ratio is not None:
            self.utt2ratio = {}
            # Use the scheduled ratio for each utterances
@ -168,8 +180,8 @@ class SpeedPerturbationSox():
    def __repr__(self):
        if self.utt2ratio is None:
            return f"""{self.__class__.__name__}(
-                lower={self.lower}, 
-                upper={self.upper}, 
+                lower={self.lower},
+                upper={self.upper},
                keep_length={self.keep_length},
                sample_rate={self.sr})"""

--- a/setup.py
+++ b/setup.py
@ -55,8 +55,6 @@ requirements = {
        "scipy",
        "sentencepiece~=0.1.96",
        "soundfile~=0.10",
-        "sox",
-        "soxbindings",
        "textgrid",
        "timer",
        "tqdm",
@ -74,6 +72,8 @@ requirements = {
        "Pillow",
        "pybind11",
        "snakeviz",
+        "sox",
+        "soxbindings",
        "unidecode",
        "yq",
        "pre-commit",