Merge branch 'develop' into ctc

pull/1099/head
Hui Zhang 3 years ago committed by GitHub
commit 764a5d4271
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -21,7 +21,6 @@ import struct
import numpy as np
import resampy
import soundfile
import soxbindings as sox
from scipy import signal
from .utility import convert_samples_from_float32
@ -98,7 +97,7 @@ class AudioSegment():
:param file: Input audio filepath or file object.
:type file: str|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
@ -199,7 +198,7 @@ class AudioSegment():
@classmethod
def from_bytes(cls, bytes):
"""Create audio segment from a byte string containing audio samples.
:param bytes: Byte string containing audio samples.
:type bytes: str
:return: Audio segment instance.
@ -217,7 +216,7 @@ class AudioSegment():
:type *segments: tuple of AudioSegment
:return: Audio segment instance as concatenating results.
:rtype: AudioSegment
:raises ValueError: If the number of segments is zero, or if the
:raises ValueError: If the number of segments is zero, or if the
sample_rate of any segments does not match.
:raises TypeError: If any segment is not AudioSegment instance.
"""
@ -251,7 +250,7 @@ class AudioSegment():
def to_wav_file(self, filepath, dtype='float32'):
"""Save audio segment to disk as wav file.
:param filepath: WAV filepath or file object to save the
audio segment.
:type filepath: str|file
@ -297,7 +296,7 @@ class AudioSegment():
def to_bytes(self, dtype='float32'):
"""Create a byte string containing the audio content.
:param dtype: Data type for export samples. Options: 'int16', 'int32',
'float32', 'float64'. Default is 'float32'.
:type dtype: str
@ -309,7 +308,7 @@ class AudioSegment():
def to(self, dtype='int16'):
"""Create a `dtype` audio content.
:param dtype: Data type for export samples. Options: 'int16', 'int32',
'float32', 'float64'. Default is 'int16'.
:type dtype: str
@ -323,8 +322,8 @@ class AudioSegment():
"""Apply gain in decibels to samples.
Note that this is an in-place transformation.
:param gain: Gain in decibels to apply to samples.
:param gain: Gain in decibels to apply to samples.
:type gain: float|1darray
"""
self._samples *= 10.**(gain / 20.)
@ -333,7 +332,7 @@ class AudioSegment():
"""Change the audio speed by linear interpolation.
Note that this is an in-place transformation.
:param speed_rate: Rate of speed change:
speed_rate > 1.0, speed up the audio;
speed_rate = 1.0, unchanged;
@ -355,6 +354,19 @@ class AudioSegment():
# self._samples = np.interp(new_indices, old_indices, self._samples)
# sox, slow
try:
import soxbindings as sox
except:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package = "sox"
dynamic_pip_install.install(package)
package = "soxbindings"
dynamic_pip_install.install(package)
import soxbindings as sox
except:
raise RuntimeError("Can not install soxbindings on your system." )
tfm = sox.Transformer()
tfm.set_globals(multithread=False)
tfm.speed(speed_rate)
@ -405,7 +417,7 @@ class AudioSegment():
:param prior_samples: Prior strength in number of samples.
:type prior_samples: float
:param startup_delay: Default 0.0s. If provided, this function will
accrue statistics for the first startup_delay
accrue statistics for the first startup_delay
seconds before applying online normalization.
:type startup_delay: float
"""
@ -557,7 +569,7 @@ class AudioSegment():
:param impulse_segment: Impulse response segments.
:type impulse_segment: AudioSegment
:param allow_resample: Indicates whether resampling is allowed when
the impulse_segment has a different sample
the impulse_segment has a different sample
rate from this signal.
:type allow_resample: bool
:raises ValueError: If the sample rate does not match between two
@ -695,7 +707,7 @@ class AudioSegment():
def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype.
Audio sample type is usually integer or floating-point. For integer
type, float32 will be rescaled from [-1, 1] to the maximum range
supported by the integer type.

@ -16,7 +16,6 @@ import librosa
import numpy
import scipy
import soundfile
import soxbindings as sox
from paddlespeech.s2t.io.reader import SoundHDF5File
@ -115,10 +114,10 @@ class SpeedPerturbationSox():
and sox-speed just to resample the input,
i.e. both pitch and tempo are changed.
To speed up or slow down the sound of a file,
use speed to modify the pitch and the duration of the file.
This raises the speed and reduces the time.
The default factor is 1.0 which makes no change to the audio.
To speed up or slow down the sound of a file,
use speed to modify the pitch and the duration of the file.
This raises the speed and reduces the time.
The default factor is 1.0 which makes no change to the audio.
2.0 doubles speed, thus time length is cut by a half and pitch is one octave higher.
"Why use speed option instead of tempo -s in SoX for speed perturbation"
@ -130,7 +129,7 @@ class SpeedPerturbationSox():
speed option:
sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9
If we use speed option like above, the pitch of audio also will be changed,
If we use speed option like above, the pitch of audio also will be changed,
but the tempo option does not change the pitch.
"""
@ -146,6 +145,19 @@ class SpeedPerturbationSox():
self.keep_length = keep_length
self.state = numpy.random.RandomState(seed)
try:
import soxbindings as sox
except:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package = "sox"
dynamic_pip_install.install(package)
package = "soxbindings"
dynamic_pip_install.install(package)
import soxbindings as sox
except:
raise RuntimeError("Can not install soxbindings on your system." )
if utt2ratio is not None:
self.utt2ratio = {}
# Use the scheduled ratio for each utterances
@ -168,8 +180,8 @@ class SpeedPerturbationSox():
def __repr__(self):
if self.utt2ratio is None:
return f"""{self.__class__.__name__}(
lower={self.lower},
upper={self.upper},
lower={self.lower},
upper={self.upper},
keep_length={self.keep_length},
sample_rate={self.sr})"""

@ -55,8 +55,6 @@ requirements = {
"scipy",
"sentencepiece~=0.1.96",
"soundfile~=0.10",
"sox",
"soxbindings",
"textgrid",
"timer",
"tqdm",
@ -74,6 +72,8 @@ requirements = {
"Pillow",
"pybind11",
"snakeviz",
"sox",
"soxbindings",
"unidecode",
"yq",
"pre-commit",

Loading…
Cancel
Save