parent
c37782c115
commit
63b4494700
@ -0,0 +1,25 @@
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
|
||||
from .sox_effects import (
|
||||
apply_effects_file,
|
||||
apply_effects_tensor,
|
||||
effect_names,
|
||||
init_sox_effects,
|
||||
shutdown_sox_effects,
|
||||
)
|
||||
|
||||
|
||||
# Initialize the sox effects as soon as this package is imported, but only
# when ``_mod_utils.is_sox_available()`` reports that sox can be used.
if _mod_utils.is_sox_available():
    import atexit

    init_sox_effects()
    # Pair the initialization above with a shutdown at interpreter exit.
    atexit.register(shutdown_sox_effects)

# Names re-exported as the public API of this package.
__all__ = [
    "init_sox_effects",
    "shutdown_sox_effects",
    "effect_names",
    "apply_effects_tensor",
    "apply_effects_file",
]
|
||||
|
@ -0,0 +1,283 @@
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
from paddlespeech.audio.utils.sox_utils import list_effects
|
||||
from paddlespeech.audio import _paddleaudio as paddleaudio
|
||||
|
||||
# Code adapted from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
|
||||
|
||||
@_mod_utils.requires_sox()
def init_sox_effects():
    """Set up the resources that sox effects need before they can be used.

    Note:
        Calling this function by hand is not required; it is invoked
        automatically.

        After the first initialization there is no need to call it again
        between uses of sox effects. Doing so is nevertheless safe, provided
        :func:`shutdown_sox_effects` has not been called yet. Once
        :func:`shutdown_sox_effects` has run, SoX effects can no longer be
        used and a further initialization results in an error.
    """
    paddleaudio.sox_effects_initialize_sox_effects()
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def shutdown_sox_effects():
    """Release the resources that were acquired for sox effects.

    Note:
        Calling this function by hand is not required; it is invoked
        automatically.

        Calling it more than once is safe. After
        :py:func:`shutdown_sox_effects` has run, SoX effects can no longer be
        used and initializing them again results in an error.
    """
    paddleaudio.sox_effects_shutdown_sox_effects()
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def effect_names() -> List[str]:
    """Return the names of the sox effects that can be used.

    Returns:
        List[str]: Names of the available effects.

    Example
        >>> paddleaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    # ``list_effects`` returns a mapping keyed by effect name; iterating the
    # mapping yields exactly those keys.
    available = list_effects()
    return list(available)
|
||||
|
||||
|
||||
# NOTE(review): the tensor annotations below are string literals on purpose.
# The torchaudio original annotated with ``torch.Tensor``, but ``torch`` is
# never imported in this module, so executing the ``def`` raised ``NameError``
# on interpreters that evaluate annotations eagerly (everything before
# Python 3.14). ``paddle.Tensor`` is presumed to be the intended type for this
# package -- confirm against what the native
# ``sox_effects_apply_effects_tensor`` binding actually accepts and returns.
# The TorchScript usage example of the upstream docstring is not reproduced
# because nothing in this module goes through ``torch``.
@_mod_utils.requires_sox()
def apply_effects_tensor(
    tensor: "paddle.Tensor",
    sample_rate: int,
    effects: List[List[str]],
    channels_first: bool = True,
) -> Tuple["paddle.Tensor", int]:
    """Apply sox effects to the given Tensor.

    Note:
        This function only works on CPU Tensors.

        This function works in a way very similar to the ``sox`` command,
        however there are slight differences. For example, the ``sox`` command
        adds certain effects automatically (such as the ``rate`` effect after
        ``speed``, ``pitch`` and other effects), but this function applies
        only the given effects. Therefore, to actually apply the ``speed``
        effect, you also need to give a ``rate`` effect with the desired
        sampling rate.

    Args:
        tensor (Tensor): Input 2D CPU Tensor.
        sample_rate (int): Sample rate.
        effects (List[List[str]]): List of effects. Each effect is a list of
            strings: the effect name followed by its arguments, e.g.
            ``['rate', '8000']``.
        channels_first (bool, optional): Indicates if the input Tensor's
            dimension is `[channels, time]` or `[time, channels]`.

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        The resulting Tensor has the same ``dtype`` as the input Tensor, and
        the same channels order. The shape of the Tensor can be different
        based on the effects applied. Sample rate can also be different based
        on the effects applied.

    Example - Basic usage
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # ``waveform`` is a pseudo wave: normalized, channels first, 2ch,
        >>> # sampling rate 16000, 1 second, i.e. shape [2, 16000]
        >>> sample_rate = 16000
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> # The new waveform is sampling rate 8000, 1 second, i.e. shape
        >>> # [2, 8000]; normalization and channel order are preserved
        >>> sample_rate
        8000
    """
    return paddleaudio.sox_effects_apply_effects_tensor(
        tensor, sample_rate, effects, channels_first)
|
||||
|
||||
|
||||
# NOTE(review): the return annotation uses a string literal on purpose. The
# torchaudio original annotated with ``torch.Tensor``, but ``torch`` is never
# imported in this module, so executing the ``def`` raised ``NameError`` on
# interpreters that evaluate annotations eagerly (everything before
# Python 3.14). ``paddle.Tensor`` is presumed to be the intended type --
# confirm against what the native bindings actually return.
@_mod_utils.requires_sox()
def apply_effects_file(
    path: str,
    effects: List[List[str]],
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple["paddle.Tensor", int]:
    """Apply sox effects to the audio file and load the resulting data as Tensor.

    Note:
        This function works in a way very similar to the ``sox`` command,
        however there are slight differences. For example, the ``sox`` command
        adds certain effects automatically (such as the ``rate`` effect after
        ``speed``, ``pitch`` etc), but this function only applies the given
        effects. Therefore, to actually apply the ``speed`` effect, you also
        need to give a ``rate`` effect with the desired sampling rate, because
        internally, ``speed`` effects only alter the sampling rate and leave
        samples untouched.

    Args:
        path (path-like object or file-like object):
            Source of audio data. The following types are accepted:

            * ``path-like``: file path
            * ``file-like``: Object with ``read(size: int) -> bytes`` method,
              which returns byte string of at most ``size`` length.

            Note: the parameter is annotated as ``str`` only; that annotation
            is inherited from the torchaudio original this code is based on.
        effects (List[List[str]]): List of effects.
        normalize (bool, optional):
            When ``True``, this function always returns ``float32``, and
            sample values are normalized to ``[-1.0, 1.0]``.
            If the input file is integer WAV, giving ``False`` will change the
            resulting Tensor type to integer type. This argument has no effect
            for formats other than integer WAV type.
        channels_first (bool, optional): When True, the returned Tensor has
            dimension `[channel, time]`. Otherwise, the returned Tensor's
            dimension is `[time, channel]`.
        format (str or None, optional):
            Override the format detection with the given format.
            Providing the argument might help when libsox can not infer the
            format from header or extension.

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
        If ``normalize=False`` and the input audio file is an integer WAV
        file, then the resulting Tensor has the corresponding integer type.
        (Note that 24 bit integer type is not supported.)
        If ``channels_first=True``, the resulting Tensor has dimension
        `[channel, time]`, otherwise `[time, channel]`.

    Raises:
        RuntimeError: If the native call returns ``None`` instead of a result.

    Example - Basic usage
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file(
        ...     "data.wav", effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> sample_rate
        8000

    Example - Apply random speed perturbation
        >>> # Suppose the input file is at least one second long.
        >>> speed = random.uniform(0.5, 2.0)
        >>> effects = [
        ...     ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...     ['remix', '-'],  # merge all the channels
        ...     ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...     ['rate', f'{sample_rate}'],
        ...     ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...     ['trim', '0', '2'],  # get the first 2 seconds
        ... ]
        >>> waveform, _ = paddleaudio.sox_effects.apply_effects_file(
        ...     path, effects)
    """
    # The torchaudio original wrapped the file-like handling in
    # ``if not torch.jit.is_scripting():``. ``torch`` is not imported in this
    # module, so that check raised ``NameError`` on every call; it is removed
    # and the non-scripting path is always taken.
    if hasattr(path, "read"):
        # NOTE(review): ``paddleaudio`` is already the ``_paddleaudio``
        # extension module (see the imports), so the extra ``._paddleaudio``
        # hop looks like a leftover of the torchaudio rename and may not
        # resolve -- confirm the binding name for file-like input.
        ret = paddleaudio._paddleaudio.apply_effects_fileobj(
            path, effects, normalize, channels_first, format)
    else:
        path = os.fspath(path)
        ret = paddleaudio.sox_effects_apply_effects_file(
            path, effects, normalize, channels_first, format)
    if ret is None:
        raise RuntimeError("Failed to load audio from {}".format(path))
    return ret
|
@ -0,0 +1,101 @@
|
||||
from typing import Dict, List
|
||||
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
from paddlespeech.audio import _paddleaudio
|
||||
|
||||
@_mod_utils.requires_sox()
def set_seed(seed: int):
    """Seed the pseudo-random number generator of libsox.

    Args:
        seed (int): Seed value. The valid range is int32.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_seed(seed)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def set_verbosity(verbosity: int):
    """Choose how verbose libsox is.

    Args:
        verbosity (int): Verbosity level of libsox.

            * ``1`` failure messages
            * ``2`` warnings
            * ``3`` details of processing
            * ``4``-``6`` increasing levels of debug messages

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_verbosity(verbosity)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def set_buffer_size(buffer_size: int):
    """Configure the buffer size of the sox effect chain.

    Args:
        buffer_size (int): Size, in bytes, of the buffers used for processing
            audio.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_buffer_size(buffer_size)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def set_use_threads(use_threads: bool):
    """Switch the multithread option of the sox effect chain on or off.

    Args:
        use_threads (bool): When ``True``, enables ``libsox``'s parallel
            effects channels processing. To use multithreading, the underlying
            ``libsox`` has to be compiled with OpenMP support.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_use_threads(use_threads)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def list_effects() -> Dict[str, str]:
    """Return the sox effects that are available.

    Returns:
        Dict[str, str]: Mapping from ``effect name`` to ``usage``
    """
    # Copy whatever the native call hands back into a plain ``dict``.
    native_listing = _paddleaudio.sox_utils_list_effects()
    return dict(native_listing)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def list_read_formats() -> List[str]:
    """Return the audio formats that are supported for reading.

    Returns:
        List[str]: List of supported audio formats
    """
    readable = _paddleaudio.sox_utils_list_read_formats()
    return readable
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def list_write_formats() -> List[str]:
    """Return the audio formats that are supported for writing.

    Returns:
        List[str]: List of supported audio formats
    """
    writable = _paddleaudio.sox_utils_list_write_formats()
    return writable
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def get_buffer_size() -> int:
    """Return the buffer size of the sox effect chain.

    Returns:
        int: size in bytes of buffers used for processing audio.
    """
    size_in_bytes = _paddleaudio.sox_utils_get_buffer_size()
    return size_in_bytes
|
Loading…
Reference in new issue