diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 65dccad3..d494cc4f 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -21,7 +21,6 @@ import struct import numpy as np import resampy import soundfile -import soxbindings as sox from scipy import signal from .utility import convert_samples_from_float32 @@ -98,7 +97,7 @@ class AudioSegment(): :param file: Input audio filepath or file object. :type file: str|file :param start: Start time in seconds. If start is negative, it wraps - around from the end. If not provided, this function + around from the end. If not provided, this function reads from the very beginning. :type start: float :param end: End time in seconds. If end is negative, it wraps around @@ -199,7 +198,7 @@ class AudioSegment(): @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. - + :param bytes: Byte string containing audio samples. :type bytes: str :return: Audio segment instance. @@ -217,7 +216,7 @@ class AudioSegment(): :type *segments: tuple of AudioSegment :return: Audio segment instance as concatenating results. :rtype: AudioSegment - :raises ValueError: If the number of segments is zero, or if the + :raises ValueError: If the number of segments is zero, or if the sample_rate of any segments does not match. :raises TypeError: If any segment is not AudioSegment instance. """ @@ -251,7 +250,7 @@ class AudioSegment(): def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. - + :param filepath: WAV filepath or file object to save the audio segment. :type filepath: str|file @@ -297,7 +296,7 @@ class AudioSegment(): def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. - + :param dtype: Data type for export samples. Options: 'int16', 'int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str @@ -309,7 +308,7 @@ class AudioSegment(): def to(self, dtype='int16'): """Create a `dtype` audio content. - + :param dtype: Data type for export samples. Options: 'int16', 'int32', 'float32', 'float64'. Default is 'float32'. :type dtype: str @@ -323,8 +322,8 @@ class AudioSegment(): """Apply gain in decibels to samples. Note that this is an in-place transformation. - - :param gain: Gain in decibels to apply to samples. + + :param gain: Gain in decibels to apply to samples. :type gain: float|1darray """ self._samples *= 10.**(gain / 20.) @@ -333,7 +332,7 @@ class AudioSegment(): """Change the audio speed by linear interpolation. Note that this is an in-place transformation. - + :param speed_rate: Rate of speed change: speed_rate > 1.0, speed up the audio; speed_rate = 1.0, unchanged; @@ -355,6 +354,19 @@ class AudioSegment(): # self._samples = np.interp(new_indices, old_indices, self._samples) # sox, slow + try: + import soxbindings as sox + except: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except: + raise RuntimeError("Can not install soxbindings on your system." ) + tfm = sox.Transformer() tfm.set_globals(multithread=False) tfm.speed(speed_rate) @@ -405,7 +417,7 @@ class AudioSegment(): :param prior_samples: Prior strength in number of samples. :type prior_samples: float :param startup_delay: Default 0.0s. If provided, this function will - accrue statistics for the first startup_delay + accrue statistics for the first startup_delay seconds before applying online normalization. :type startup_delay: float """ @@ -557,7 +569,7 @@ class AudioSegment(): :param impulse_segment: Impulse response segments. :type impulse_segment: AudioSegment :param allow_resample: Indicates whether resampling is allowed when - the impulse_segment has a different sample + the impulse_segment has a different sample rate from this signal. :type allow_resample: bool :raises ValueError: If the sample rate is not match between two @@ -695,7 +707,7 @@ class AudioSegment(): def _convert_samples_from_float32(self, samples, dtype): """Convert sample type from float32 to dtype. - + Audio sample type is usually integer or float-point. For integer type, float32 will be rescaled from [-1, 1] to the maximum range supported by the integer type. diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 873adb0b..90144197 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -16,7 +16,6 @@ import librosa import numpy import scipy import soundfile -import soxbindings as sox from paddlespeech.s2t.io.reader import SoundHDF5File @@ -115,10 +114,10 @@ class SpeedPerturbationSox(): and sox-speed just to resample the input, i.e pitch and tempo are changed both. - To speed up or slow down the sound of a file, - use speed to modify the pitch and the duration of the file. - This raises the speed and reduces the time. - The default factor is 1.0 which makes no change to the audio. + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. "Why use speed option instead of tempo -s in SoX for speed perturbation" @@ -130,7 +129,7 @@ class SpeedPerturbationSox(): speed option: sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 - If we use speed option like above, the pitch of audio also will be changed, + If we use speed option like above, the pitch of audio also will be changed, but the tempo option does not change the pitch. """ @@ -146,6 +145,19 @@ class SpeedPerturbationSox(): self.keep_length = keep_length self.state = numpy.random.RandomState(seed) + try: + import soxbindings as sox + except: + try: + from paddlespeech.s2t.utils import dynamic_pip_install + package = "sox" + dynamic_pip_install.install(package) + package = "soxbindings" + dynamic_pip_install.install(package) + import soxbindings as sox + except: + raise RuntimeError("Can not install soxbindings on your system." ) + if utt2ratio is not None: self.utt2ratio = {} # Use the scheduled ratio for each utterances @@ -168,8 +180,8 @@ class SpeedPerturbationSox(): def __repr__(self): if self.utt2ratio is None: return f"""{self.__class__.__name__}( - lower={self.lower}, - upper={self.upper}, + lower={self.lower}, + upper={self.upper}, keep_length={self.keep_length}, sample_rate={self.sr})""" diff --git a/setup.py b/setup.py index 9aaaa6eb..1ac671f1 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ requirements = { "scipy", "sentencepiece~=0.1.96", "soundfile~=0.10", - "sox", - "soxbindings", "textgrid", "timer", "tqdm", @@ -74,6 +72,8 @@ requirements = { "Pillow", "pybind11", "snakeviz", + "sox", + "soxbindings", "unidecode", "yq", "pre-commit",