diff --git a/paddlespeech/audio/backends/sox_io_backend.py b/paddlespeech/audio/backends/sox_io_backend.py index b44ac30f8..750d4de1a 100644 --- a/paddlespeech/audio/backends/sox_io_backend.py +++ b/paddlespeech/audio/backends/sox_io_backend.py @@ -8,8 +8,7 @@ from paddle import Tensor from .common import AudioMetaData from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.audio._paddleaudio import get_info_file -from paddlespeech.audio._paddleaudio import get_info_fileobj +from paddlespeech.aduio import _paddleaudio as paddleaudio #https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py @@ -43,26 +42,38 @@ _fallback_load_filebj = _fail_load_fileobj @_mod_utils.requires_sox() def load( - filepath: Union[str, Path], - out: Optional[Tensor]=None, - normalization: Union[bool, float, Callable]=True, - channels_first: bool=True, - num_frames: int=0, - offset: int=0, - filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: - raise RuntimeError("No audio I/O backend is available.") + filepath: str, + frame_offset: int = 0, + num_frames: int=-1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str]=None, ) -> Tuple[Tensor, int]: + ret = paddleaudio.sox_io_load_audio_file( + filepath, frame_offset, num_frames, normalize, channels_first, format + ) + if ret is not None: + return ret + return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format) + @_mod_utils.requires_sox() def save(filepath: str, - src: Tensor, - sample_rate: int, - precision: int = 16, - channels_first: bool = True) -> None: - raise RuntimeError("No audio I/O backend is available.") + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None) -> Tuple[Tensor, int]: + ret = paddleaudio.sox_io_load_audio_file( + filepath, frame_offset, num_frames, normalize, channels_first, format + ) + if ret is not None: + return ret + return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format) + @_mod_utils.requires_sox() def info(filepath: str, format: Optional[str]) -> None: - sinfo = paddleaudio._paddleaudio.get_info_file(filepath, format) + sinfo = paddleaudio.get_info_file(filepath, format) if sinfo is not None: return AudioMetaData(*sinfo) return _fallback_info(filepath, format) diff --git a/paddlespeech/audio/sox_effects/__init__.py b/paddlespeech/audio/sox_effects/__init__.py new file mode 100644 index 000000000..d68158776 --- /dev/null +++ b/paddlespeech/audio/sox_effects/__init__.py @@ -0,0 +1,25 @@ +from paddlespeech.audio._internal import module_utils as _mod_utils + +from .sox_effects import ( + apply_effects_file, + apply_effects_tensor, + effect_names, + init_sox_effects, + shutdown_sox_effects, +) + + +if _mod_utils.is_sox_available(): + import atexit + + init_sox_effects() + atexit.register(shutdown_sox_effects) + +__all__ = [ + "init_sox_effects", + "shutdown_sox_effects", + "effect_names", + "apply_effects_tensor", + "apply_effects_file", +] + diff --git a/paddlespeech/audio/sox_effects/sox_effects.py b/paddlespeech/audio/sox_effects/sox_effects.py new file mode 100644 index 000000000..1a3f3af29 --- /dev/null +++ b/paddlespeech/audio/sox_effects/sox_effects.py @@ -0,0 +1,283 @@ +import os +from typing import List, Optional, Tuple + +from paddlespeech.audio._internal import module_utils as _mod_utils +from paddlespeech.audio.utils.sox_utils import list_effects +from paddlespeech.audio import _paddleaudio as paddleaudio + +#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py + +@_mod_utils.requires_sox() +def init_sox_effects(): + """Initialize resources required to use sox effects. + + Note: + You do not need to call this function manually. It is called automatically. + + Once initialized, you do not need to call this function again across the multiple uses of + sox effects though it is safe to do so as long as :func:`shutdown_sox_effects` is not called yet. + Once :func:`shutdown_sox_effects` is called, you can no longer use SoX effects and initializing + again will result in error. + """ + paddleaudio.sox_effects_initialize_sox_effects() + + +@_mod_utils.requires_sox() +def shutdown_sox_effects(): + """Clean up resources required to use sox effects. + + Note: + You do not need to call this function manually. It is called automatically. + + It is safe to call this function multiple times. + Once :py:func:`shutdown_sox_effects` is called, you can no longer use SoX effects and + initializing again will result in error. + """ + paddleaudio.sox_effects_shutdown_sox_effects() + + +@_mod_utils.requires_sox() +def effect_names() -> List[str]: + """Gets list of valid sox effect names + + Returns: + List[str]: list of available effect names. + + Example + >>> paddleaudio.sox_effects.effect_names() + ['allpass', 'band', 'bandpass', ... ] + """ + return list(list_effects().keys()) + + +@_mod_utils.requires_sox() +def apply_effects_tensor( + tensor: torch.Tensor, + sample_rate: int, + effects: List[List[str]], + channels_first: bool = True, +) -> Tuple[torch.Tensor, int]: + """Apply sox effects to given Tensor + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + This function only works on CPU Tensors. + This function works in the way very similar to ``sox`` command, however there are slight + differences. For example, ``sox`` command adds certain effects automatically (such as + ``rate`` effect after ``speed`` and ``pitch`` and other effects), but this function does + only applies the given effects. (Therefore, to actually apply ``speed`` effect, you also + need to give ``rate`` effect with desired sampling rate.). + + Args: + tensor (torch.Tensor): Input 2D CPU Tensor. + sample_rate (int): Sample rate + effects (List[List[str]]): List of effects. + channels_first (bool, optional): Indicates if the input Tensor's dimension is + `[channels, time]` or `[time, channels]` + + Returns: + (Tensor, int): Resulting Tensor and sample rate. + The resulting Tensor has the same ``dtype`` as the input Tensor, and + the same channels order. The shape of the Tensor can be different based on the + effects applied. Sample rate can also be different based on the effects applied. + + Example - Basic usage + >>> + >>> # Defines the effects to apply + >>> effects = [ + ... ['gain', '-n'], # normalises to 0dB + ... ['pitch', '5'], # 5 cent pitch shift + ... ['rate', '8000'], # resample to 8000 Hz + ... ] + >>> + >>> # Generate pseudo wave: + >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second + >>> sample_rate = 16000 + >>> waveform = 2 * torch.rand([2, sample_rate * 1]) - 1 + >>> waveform.shape + torch.Size([2, 16000]) + >>> waveform + tensor([[ 0.3138, 0.7620, -0.9019, ..., -0.7495, -0.4935, 0.5442], + [-0.0832, 0.0061, 0.8233, ..., -0.5176, -0.9140, -0.2434]]) + >>> + >>> # Apply effects + >>> waveform, sample_rate = apply_effects_tensor( + ... wave_form, sample_rate, effects, channels_first=True) + >>> + >>> # Check the result + >>> # The new waveform is sampling rate 8000, 1 second. + >>> # normalization and channel order are preserved + >>> waveform.shape + torch.Size([2, 8000]) + >>> waveform + tensor([[ 0.5054, -0.5518, -0.4800, ..., -0.0076, 0.0096, -0.0110], + [ 0.1331, 0.0436, -0.3783, ..., -0.0035, 0.0012, 0.0008]]) + >>> sample_rate + 8000 + + Example - Torchscript-able transform + >>> + >>> # Use `apply_effects_tensor` in `torch.nn.Module` and dump it to file, + >>> # then run sox effect via Torchscript runtime. + >>> + >>> class SoxEffectTransform(torch.nn.Module): + ... effects: List[List[str]] + ... + ... def __init__(self, effects: List[List[str]]): + ... super().__init__() + ... self.effects = effects + ... + ... def forward(self, tensor: torch.Tensor, sample_rate: int): + ... return sox_effects.apply_effects_tensor( + ... tensor, sample_rate, self.effects) + ... + ... + >>> # Create transform object + >>> effects = [ + ... ["lowpass", "-1", "300"], # apply single-pole lowpass filter + ... ["rate", "8000"], # change sample rate to 8000 + ... ] + >>> transform = SoxEffectTensorTransform(effects, input_sample_rate) + >>> + >>> # Dump it to file and load + >>> path = 'sox_effect.zip' + >>> torch.jit.script(trans).save(path) + >>> transform = torch.jit.load(path) + >>> + >>>> # Run transform + >>> waveform, input_sample_rate = paddleaudio.load("input.wav") + >>> waveform, sample_rate = transform(waveform, input_sample_rate) + >>> assert sample_rate == 8000 + """ + return paddleaudio.sox_effects_apply_effects_tensor(tensor, sample_rate, effects, channels_first) + + +@_mod_utils.requires_sox() +def apply_effects_file( + path: str, + effects: List[List[str]], + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Apply sox effects to the audio file and load the resulting data as Tensor + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + This function works in the way very similar to ``sox`` command, however there are slight + differences. For example, ``sox`` commnad adds certain effects automatically (such as + ``rate`` effect after ``speed``, ``pitch`` etc), but this function only applies the given + effects. Therefore, to actually apply ``speed`` effect, you also need to give ``rate`` + effect with desired sampling rate, because internally, ``speed`` effects only alter sampling + rate and leave samples untouched. + + Args: + path (path-like object or file-like object): + Source of audio data. When the function is not compiled by TorchScript, + (e.g. ``torch.jit.script``), the following types are accepted: + + * ``path-like``: file path + * ``file-like``: Object with ``read(size: int) -> bytes`` method, + which returns byte string of at most ``size`` length. + + When the function is compiled by TorchScript, only ``str`` type is allowed. + + Note: This argument is intentionally annotated as ``str`` only for + TorchScript compiler compatibility. + effects (List[List[str]]): List of effects. + normalize (bool, optional): + When ``True``, this function always return ``float32``, and sample values are + normalized to ``[-1.0, 1.0]``. + If input file is integer WAV, giving ``False`` will change the resulting Tensor type to + integer type. This argument has no effect for formats other + than integer WAV type. + channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Override the format detection with the given format. + Providing the argument might help when libsox can not infer the format + from header or extension, + + Returns: + (Tensor, int): Resulting Tensor and sample rate. + If ``normalize=True``, the resulting Tensor is always ``float32`` type. + If ``normalize=False`` and the input audio file is of integer WAV file, then the + resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported) + If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`, + otherwise `[time, channel]`. + + Example - Basic usage + >>> + >>> # Defines the effects to apply + >>> effects = [ + ... ['gain', '-n'], # normalises to 0dB + ... ['pitch', '5'], # 5 cent pitch shift + ... ['rate', '8000'], # resample to 8000 Hz + ... ] + >>> + >>> # Apply effects and load data with channels_first=True + >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True) + >>> + >>> # Check the result + >>> waveform.shape + torch.Size([2, 8000]) + >>> waveform + tensor([[ 5.1151e-03, 1.8073e-02, 2.2188e-02, ..., 1.0431e-07, + -1.4761e-07, 1.8114e-07], + [-2.6924e-03, 2.1860e-03, 1.0650e-02, ..., 6.4122e-07, + -5.6159e-07, 4.8103e-07]]) + >>> sample_rate + 8000 + + Example - Apply random speed perturbation to dataset + >>> + >>> # Load data from file, apply random speed perturbation + >>> class RandomPerturbationFile(torch.utils.data.Dataset): + ... \"\"\"Given flist, apply random speed perturbation + ... + ... Suppose all the input files are at least one second long. + ... \"\"\" + ... def __init__(self, flist: List[str], sample_rate: int): + ... super().__init__() + ... self.flist = flist + ... self.sample_rate = sample_rate + ... + ... def __getitem__(self, index): + ... speed = 0.5 + 1.5 * random.randn() + ... effects = [ + ... ['gain', '-n', '-10'], # apply 10 db attenuation + ... ['remix', '-'], # merge all the channels + ... ['speed', f'{speed:.5f}'], # duration is now 0.5 ~ 2.0 seconds. + ... ['rate', f'{self.sample_rate}'], + ... ['pad', '0', '1.5'], # add 1.5 seconds silence at the end + ... ['trim', '0', '2'], # get the first 2 seconds + ... ] + ... waveform, _ = paddleaudio.sox_effects.apply_effects_file( + ... self.flist[index], effects) + ... return waveform + ... + ... def __len__(self): + ... return len(self.flist) + ... + >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000) + >>> loader = torch.utils.data.DataLoader(dataset, batch_size=32) + >>> for batch in loader: + >>> pass + """ + if not torch.jit.is_scripting(): + if hasattr(path, "read"): + ret = paddleaudio._paddleaudio.apply_effects_fileobj(path, effects, normalize, channels_first, format) + if ret is None: + raise RuntimeError("Failed to load audio from {}".format(path)) + return ret + path = os.fspath(path) + ret = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format) + if ret is not None: + return ret + raise RuntimeError("Failed to load audio from {}".format(path)) \ No newline at end of file diff --git a/paddlespeech/audio/src/pybind/pybind.cpp b/paddlespeech/audio/src/pybind/pybind.cpp index 9cd12bc9e..776e43a7e 100644 --- a/paddlespeech/audio/src/pybind/pybind.cpp +++ b/paddlespeech/audio/src/pybind/pybind.cpp @@ -5,17 +5,23 @@ #include "paddlespeech/audio/src/pybind/sox/io.h" #include "paddlespeech/audio/src/pybind/sox/effects.h" #include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h" + #include -#include -#incldue -#include +#include + +// `tl::optional` +namespace pybind11 { namespace detail { + template + struct type_caster> : optional_caster> {}; +}} PYBIND11_MODULE(_paddleaudio, m) { #ifdef INCLUDE_SOX m.def("get_info_file", &paddleaudio::sox_io::get_info_file, "Get metadata of audio file."); - m.def("get_info_fileobj", + // support obj later + /*m.def("get_info_fileobj", &paddleaudio::sox_io::get_info_fileobj, "Get metadata of audio in file object."); m.def("load_audio_fileobj", @@ -24,6 +30,7 @@ PYBIND11_MODULE(_paddleaudio, m) { m.def("save_audio_fileobj", &paddleaudio::sox_io::save_audio_fileobj, "Save audio to file obj."); + */ // sox io m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file); m.def( @@ -58,9 +65,9 @@ PYBIND11_MODULE(_paddleaudio, m) { &paddleaudio::sox_utils::get_buffer_size); // effect - m.def("apply_effects_fileobj", - &paddleaudio::sox_effects::apply_effects_fileobj, - "Decode audio data from file-like obj and apply effects."); + //m.def("apply_effects_fileobj", + // &paddleaudio::sox_effects::apply_effects_fileobj, + // "Decode audio data from file-like obj and apply effects."); m.def("sox_effects_initialize_sox_effects", &paddleaudio::sox_effects::initialize_sox_effects); m.def( diff --git a/paddlespeech/audio/utils/sox_utils.py b/paddlespeech/audio/utils/sox_utils.py new file mode 100644 index 000000000..fb19ff316 --- /dev/null +++ b/paddlespeech/audio/utils/sox_utils.py @@ -0,0 +1,101 @@ +from typing import Dict, List + +from paddlespeech.audio._internal import module_utils as _mod_utils +from paddlespeech.audio import _paddleaudio + +@_mod_utils.requires_sox() +def set_seed(seed: int): + """Set libsox's PRNG + + Args: + seed (int): seed value. valid range is int32. + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_seed(seed) + + +@_mod_utils.requires_sox() +def set_verbosity(verbosity: int): + """Set libsox's verbosity + + Args: + verbosity (int): Set verbosity level of libsox. + + * ``1`` failure messages + * ``2`` warnings + * ``3`` details of processing + * ``4``-``6`` increasing levels of debug messages + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_verbosity(verbosity) + + +@_mod_utils.requires_sox() +def set_buffer_size(buffer_size: int): + """Set buffer size for sox effect chain + + Args: + buffer_size (int): Set the size in bytes of the buffers used for processing audio. + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_buffer_size(buffer_size) + + +@_mod_utils.requires_sox() +def set_use_threads(use_threads: bool): + """Set multithread option for sox effect chain + + Args: + use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing. + To use mutlithread, the underlying ``libsox`` has to be compiled with OpenMP support. + + See Also: + http://sox.sourceforge.net/sox.html + """ + _paddleaudio.sox_utils_set_use_threads(use_threads) + + +@_mod_utils.requires_sox() +def list_effects() -> Dict[str, str]: + """List the available sox effect names + + Returns: + Dict[str, str]: Mapping from ``effect name`` to ``usage`` + """ + return dict(_paddleaudio.sox_utils_list_effects()) + + +@_mod_utils.requires_sox() +def list_read_formats() -> List[str]: + """List the supported audio formats for read + + Returns: + List[str]: List of supported audio formats + """ + return _paddleaudio.sox_utils_list_read_formats() + + +@_mod_utils.requires_sox() +def list_write_formats() -> List[str]: + """List the supported audio formats for write + + Returns: + List[str]: List of supported audio formats + """ + return _paddleaudio.sox_utils_list_write_formats() + + +@_mod_utils.requires_sox() +def get_buffer_size() -> int: + """Get buffer size for sox effect chain + + Returns: + int: size in bytes of buffers used for processing audio. + """ + return _paddleaudio.sox_utils_get_buffer_size()