fix optional bind, add sox_effects

3 years ago · 63b4494700
parent c37782c115
commit 63b4494700
5 changed files with 450 additions and 23 deletions
--- a/paddlespeech/audio/backends/sox_io_backend.py
+++ b/paddlespeech/audio/backends/sox_io_backend.py
@ -8,8 +8,7 @@ from paddle import Tensor
 from .common import AudioMetaData

 from paddlespeech.audio._internal import module_utils  as _mod_utils
-from paddlespeech.audio._paddleaudio import get_info_file
-from paddlespeech.audio._paddleaudio import get_info_fileobj
+# Compiled extension module (see PYBIND11_MODULE(_paddleaudio, m) in pybind.cpp).
+from paddlespeech.audio import _paddleaudio as paddleaudio

 #https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py

@ -43,26 +42,38 @@ _fallback_load_filebj = _fail_load_fileobj

@_mod_utils.requires_sox()
 def load(
-        filepath: Union[str, Path],
-        out: Optional[Tensor]=None,
-        normalization: Union[bool, float, Callable]=True,
-        channels_first: bool=True,
-        num_frames: int=0,
-        offset: int=0,
-        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
-    raise RuntimeError("No audio I/O backend is available.")
+        filepath: str,
+        frame_offset: int = 0,
+        num_frames: int=-1,
+        normalize: bool = True,
+        channels_first: bool = True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    """Load audio from ``filepath`` through the libsox binding.
+
+    All arguments are forwarded unchanged to
+    ``paddleaudio.sox_io_load_audio_file``. When the binding returns ``None``
+    the call is delegated to ``_fallback_load`` with the same arguments.
+
+    Args:
+        filepath (str): Path of the audio file.
+        frame_offset (int): Forwarded to the binding; presumably the number
+            of leading frames to skip -- TODO confirm against the C++ side.
+        num_frames (int): Forwarded to the binding; ``-1`` presumably means
+            "read until the end" -- TODO confirm.
+        normalize (bool): Forwarded to the binding.
+        channels_first (bool): Forwarded to the binding.
+        format (str or None): Forwarded to the binding.
+
+    Returns:
+        Tuple[Tensor, int]: The value produced by the binding or by the
+        fallback; annotated as the waveform and its sample rate.
+    """
+    ret = paddleaudio.sox_io_load_audio_file(
+        filepath, frame_offset, num_frames, normalize, channels_first, format
+    )
+    if ret is not None:
+        return ret
+    # NOTE(review): ``_fallback_load`` is not defined in the visible part of
+    # this file -- confirm it exists next to ``_fallback_load_filebj``.
+    return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)
+

@_mod_utils.requires_sox()
 def save(filepath: str, 
-         src: Tensor, 
-         sample_rate: int, 
-         precision: int = 16, 
-         channels_first: bool = True) -> None:
-    raise RuntimeError("No audio I/O backend is available.")
+         src: Tensor,
+         sample_rate: int,
+         channels_first: bool = True,
+         compression: Optional[float] = None,
+         format: Optional[str] = None,
+         encoding: Optional[str] = None,
+         bits_per_sample: Optional[int] = None) -> None:
+    """Save the waveform ``src`` to ``filepath`` through the libsox binding.
+
+    Args:
+        filepath (str): Destination path.
+        src (Tensor): Audio data to write.
+        sample_rate (int): Sampling rate of ``src``.
+        channels_first (bool): Whether ``src`` is laid out as
+            ``[channel, time]`` (``True``) or ``[time, channel]``.
+        compression (float or None): Forwarded to the binding.
+        format (str or None): Forwarded to the binding.
+        encoding (str or None): Forwarded to the binding.
+        bits_per_sample (int or None): Forwarded to the binding.
+    """
+    # NOTE(review): the previous body was a verbatim copy of ``load`` (it
+    # called ``sox_io_load_audio_file`` and returned a tensor), so ``save``
+    # never wrote anything. The binding name and argument order used here
+    # mirror torchaudio's ``sox_io_backend.save``; confirm that
+    # ``sox_io_save_audio_file`` is registered in pybind.cpp.
+    paddleaudio.sox_io_save_audio_file(
+        filepath, src, sample_rate, channels_first, compression, format,
+        encoding, bits_per_sample)
+

@_mod_utils.requires_sox()
-def info(filepath: str, format: Optional[str]) -> None:
-    sinfo = paddleaudio._paddleaudio.get_info_file(filepath, format)
+def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
+    """Return the metadata of the audio file at ``filepath``.
+
+    Args:
+        filepath (str): Path of the audio file.
+        format (str or None): Forwarded to the binding; ``None`` by default.
+
+    Returns:
+        AudioMetaData: Built from the tuple returned by
+        ``paddleaudio.get_info_file``; when the binding returns ``None`` the
+        result of ``_fallback_info`` is returned instead.
+    """
+    sinfo = paddleaudio.get_info_file(filepath, format)
    if sinfo is not None:
        return AudioMetaData(*sinfo)
    return _fallback_info(filepath, format)
--- a/paddlespeech/audio/sox_effects/__init__.py
+++ b/paddlespeech/audio/sox_effects/__init__.py
@ -0,0 +1,25 @@
+from paddlespeech.audio._internal import module_utils as _mod_utils
+
+from .sox_effects import (
+    apply_effects_file,
+    apply_effects_tensor,
+    effect_names,
+    init_sox_effects,
+    shutdown_sox_effects,
+)
+
+
+# Import-time setup: runs only when ``_mod_utils.is_sox_available()`` is true,
+# so importing the package stays side-effect free otherwise.
+if _mod_utils.is_sox_available():
+    import atexit
+
+    init_sox_effects()
+    # Pair the initialisation above with a shutdown at interpreter exit.
+    atexit.register(shutdown_sox_effects)
+
+# Names re-exported from ``.sox_effects`` as the public API of this package.
+__all__ = [
+    "init_sox_effects",
+    "shutdown_sox_effects",
+    "effect_names",
+    "apply_effects_tensor",
+    "apply_effects_file",
+]
+
--- a/paddlespeech/audio/sox_effects/sox_effects.py
+++ b/paddlespeech/audio/sox_effects/sox_effects.py
@ -0,0 +1,283 @@
+import os
+from typing import List, Optional, Tuple
+
+import paddle
+
+from paddlespeech.audio import _paddleaudio as paddleaudio
+from paddlespeech.audio._internal import module_utils as _mod_utils
+from paddlespeech.audio.utils.sox_utils import list_effects
+
+#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
+
+@_mod_utils.requires_sox()
+def init_sox_effects():
+    """Initialize resources required to use sox effects.
+
+    Note:
+        You do not need to call this function manually. It is called automatically.
+
+    Calling it again after a successful initialization is not required, but it is safe to
+    do so as long as :func:`shutdown_sox_effects` has not been called yet.
+    Once :func:`shutdown_sox_effects` is called, SoX effects can no longer be used and
+    initializing again results in an error.
+    """
+    # Delegates to the ``_paddleaudio`` extension module.
+    paddleaudio.sox_effects_initialize_sox_effects()
+
+
+@_mod_utils.requires_sox()
+def shutdown_sox_effects():
+    """Clean up resources required to use sox effects.
+
+    Note:
+        You do not need to call this function manually. It is called automatically.
+
+    It is safe to call this function multiple times.
+    Once :py:func:`shutdown_sox_effects` is called, SoX effects can no longer be used and
+    initializing again results in an error.
+    """
+    # Delegates to the ``_paddleaudio`` extension module.
+    paddleaudio.sox_effects_shutdown_sox_effects()
+
+
+@_mod_utils.requires_sox()
+def effect_names() -> List[str]:
+    """Return the names of the sox effects that can be used.
+
+    Returns:
+        List[str]: the keys of the mapping produced by ``list_effects``.
+
+    Example
+        >>> paddleaudio.sox_effects.effect_names()
+        ['allpass', 'band', 'bandpass', ... ]
+    """
+    available = list_effects()
+    return [name for name in available]
+
+
+@_mod_utils.requires_sox()
+def apply_effects_tensor(
+    tensor: paddle.Tensor,
+    sample_rate: int,
+    effects: List[List[str]],
+    channels_first: bool = True,
+) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to given Tensor
+
+    .. devices:: CPU
+
+    Note:
+        This function only works on CPU Tensors.
+        This function works in a way very similar to the ``sox`` command, however there are
+        slight differences. For example, the ``sox`` command adds certain effects automatically
+        (such as the ``rate`` effect after ``speed``, ``pitch`` and other effects), but this
+        function only applies the given effects. (Therefore, to actually apply the ``speed``
+        effect, you also need to give the ``rate`` effect with the desired sampling rate.)
+
+    Args:
+        tensor (paddle.Tensor): Input 2D CPU Tensor.
+        sample_rate (int): Sample rate
+        effects (List[List[str]]): List of effects.
+        channels_first (bool, optional): Indicates if the input Tensor's dimension is
+            `[channels, time]` or `[time, channels]`
+
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        The resulting Tensor has the same ``dtype`` as the input Tensor, and
+        the same channels order. The shape of the Tensor can be different based on the
+        effects applied. Sample rate can also be different based on the effects applied.
+
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Generate pseudo wave:
+        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
+        >>> sample_rate = 16000
+        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
+        >>> waveform.shape
+        [2, 16000]
+        >>>
+        >>> # Apply effects
+        >>> waveform, sample_rate = apply_effects_tensor(
+        ...     waveform, sample_rate, effects, channels_first=True)
+        >>>
+        >>> # Check the result
+        >>> # The new waveform is sampling rate 8000, 1 second.
+        >>> # normalization and channel order are preserved
+        >>> waveform.shape
+        [2, 8000]
+        >>> sample_rate
+        8000
+    """
+    # The TorchScript example of the upstream torchaudio docstring is left out
+    # on purpose: this module does not use torch.
+    # NOTE(review): the tensor is handed to the binding as-is; confirm that the
+    # C++ side accepts a paddle.Tensor directly (it may need a numpy array).
+    return paddleaudio.sox_effects_apply_effects_tensor(tensor, sample_rate, effects, channels_first)
+
+
+@_mod_utils.requires_sox()
+def apply_effects_file(
+    path: str,
+    effects: List[List[str]],
+    normalize: bool = True,
+    channels_first: bool = True,
+    format: Optional[str] = None,
+) -> Tuple[paddle.Tensor, int]:
+    """Apply sox effects to the audio file and load the resulting data as Tensor
+
+    .. devices:: CPU
+
+    Note:
+        This function works in a way very similar to the ``sox`` command, however there are
+        slight differences. For example, the ``sox`` command adds certain effects automatically
+        (such as the ``rate`` effect after ``speed``, ``pitch`` etc), but this function only
+        applies the given effects. Therefore, to actually apply the ``speed`` effect, you also
+        need to give the ``rate`` effect with the desired sampling rate, because internally,
+        the ``speed`` effect only alters the sampling rate and leaves samples untouched.
+
+    Args:
+        path (path-like object):
+            Source of audio data. It is converted with ``os.fspath`` before being
+            handed to the binding. File-like objects (anything with a ``read``
+            attribute) are rejected, see ``Raises``.
+        effects (List[List[str]]): List of effects.
+        normalize (bool, optional):
+            When ``True``, this function always returns ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If the input file is integer WAV, giving ``False`` will change the resulting Tensor
+            type to integer type. This argument has no effect for formats other
+            than integer WAV type.
+        channels_first (bool, optional): When True, the returned Tensor has dimension
+            `[channel, time]`. Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Override the format detection with the given format.
+            Providing the argument might help when libsox can not infer the format
+            from header or extension.
+
+    Returns:
+        (Tensor, int): Resulting Tensor and sample rate.
+        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
+        If ``normalize=False`` and the input audio file is of integer WAV file, then the
+        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
+        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
+        otherwise `[time, channel]`.
+
+    Raises:
+        NotImplementedError: If ``path`` is a file-like object.
+        RuntimeError: If the binding returns ``None``.
+
+    Example - Basic usage
+        >>>
+        >>> # Defines the effects to apply
+        >>> effects = [
+        ...     ['gain', '-n'],  # normalises to 0dB
+        ...     ['pitch', '5'],  # 5 cent pitch shift
+        ...     ['rate', '8000'],  # resample to 8000 Hz
+        ... ]
+        >>>
+        >>> # Apply effects and load data with channels_first=True
+        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
+        >>> sample_rate
+        8000
+
+    Example - Random speed perturbation
+        >>>
+        >>> # Suppose the input file is at least one second long.
+        >>> speed = random.uniform(0.5, 2.0)
+        >>> effects = [
+        ...     ['gain', '-n', '-10'],  # apply 10 db attenuation
+        ...     ['remix', '-'],  # merge all the channels
+        ...     ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
+        ...     ['rate', '8000'],
+        ...     ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
+        ...     ['trim', '0', '2'],  # get the first 2 seconds
+        ... ]
+        >>> waveform, _ = apply_effects_file("data.wav", effects)
+    """
+    if hasattr(path, "read"):
+        # The ``apply_effects_fileobj`` binding is commented out in pybind.cpp,
+        # so there is nothing to dispatch to. NotImplementedError subclasses
+        # RuntimeError, which keeps ``except RuntimeError`` callers working.
+        raise NotImplementedError(
+            "apply_effects_file does not support file-like objects yet; "
+            "pass a file path instead.")
+    path = os.fspath(path)
+    ret = paddleaudio.sox_effects_apply_effects_file(path, effects, normalize, channels_first, format)
+    if ret is not None:
+        return ret
+    raise RuntimeError("Failed to load audio from {}".format(path))
--- a/paddlespeech/audio/src/pybind/pybind.cpp
+++ b/paddlespeech/audio/src/pybind/pybind.cpp
@ -5,17 +5,23 @@
 #include "paddlespeech/audio/src/pybind/sox/io.h"
 #include "paddlespeech/audio/src/pybind/sox/effects.h"
 #include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"
+
 #include <pybind11/stl.h>
-#include <pybind11/complex.h>
-#incldue <pybind11/functional.h>
-#include <pybind11/chrono.h>
+#include <pybind11/pybind11.h>
+
+// Register `tl::optional<T>` with pybind11 by reusing the generic
+// `optional_caster` that <pybind11/stl.h> provides for optional-like types.
+namespace pybind11 { namespace detail {
+   template <typename T>
+   struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
+}}

 PYBIND11_MODULE(_paddleaudio, m) {
 #ifdef INCLUDE_SOX
    m.def("get_info_file",
          &paddleaudio::sox_io::get_info_file,
          "Get metadata of audio file.");
-    m.def("get_info_fileobj",
+    // support obj later
+    /*m.def("get_info_fileobj",
          &paddleaudio::sox_io::get_info_fileobj,
          "Get metadata of audio in file object.");
    m.def("load_audio_fileobj",
@ -24,6 +30,7 @@ PYBIND11_MODULE(_paddleaudio, m) {
    m.def("save_audio_fileobj",
          &paddleaudio::sox_io::save_audio_fileobj,
          "Save audio to file obj.");
+          */
    // sox io
     m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
     m.def(
@ -58,9 +65,9 @@ PYBIND11_MODULE(_paddleaudio, m) {
         &paddleaudio::sox_utils::get_buffer_size);

     // effect
-     m.def("apply_effects_fileobj",
-           &paddleaudio::sox_effects::apply_effects_fileobj,
-           "Decode audio data from file-like obj and apply effects.");
+     //m.def("apply_effects_fileobj",
+     //      &paddleaudio::sox_effects::apply_effects_fileobj,
+     //      "Decode audio data from file-like obj and apply effects.");
     m.def("sox_effects_initialize_sox_effects",
       &paddleaudio::sox_effects::initialize_sox_effects);
     m.def(
--- a/paddlespeech/audio/utils/sox_utils.py
+++ b/paddlespeech/audio/utils/sox_utils.py
@ -0,0 +1,101 @@
+from typing import Dict, List
+
+from paddlespeech.audio._internal import module_utils as _mod_utils
+from paddlespeech.audio import _paddleaudio
+
+@_mod_utils.requires_sox()
+def set_seed(seed: int):
+    """Set the seed of libsox's PRNG
+
+    Args:
+        seed (int): seed value. valid range is int32.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    # Delegates to the ``_paddleaudio`` extension module.
+    _paddleaudio.sox_utils_set_seed(seed)
+
+
+@_mod_utils.requires_sox()
+def set_verbosity(verbosity: int):
+    """Set libsox's verbosity
+
+    Args:
+        verbosity (int): Set verbosity level of libsox.
+
+            * ``1`` failure messages
+            * ``2`` warnings
+            * ``3`` details of processing
+            * ``4``-``6`` increasing levels of debug messages
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    # Delegates to the ``_paddleaudio`` extension module.
+    _paddleaudio.sox_utils_set_verbosity(verbosity)
+
+
+@_mod_utils.requires_sox()
+def set_buffer_size(buffer_size: int):
+    """Set buffer size for sox effect chain
+
+    Args:
+        buffer_size (int): Set the size in bytes of the buffers used for processing audio.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    # Delegates to the ``_paddleaudio`` extension module.
+    _paddleaudio.sox_utils_set_buffer_size(buffer_size)
+
+
+@_mod_utils.requires_sox()
+def set_use_threads(use_threads: bool):
+    """Set multithread option for sox effect chain
+
+    Args:
+        use_threads (bool): When ``True``, enables ``libsox``'s parallel effects channels processing.
+            To use multithreading, the underlying ``libsox`` has to be compiled with OpenMP support.
+
+    See Also:
+        http://sox.sourceforge.net/sox.html
+    """
+    # Delegates to the ``_paddleaudio`` extension module.
+    _paddleaudio.sox_utils_set_use_threads(use_threads)
+
+
+@_mod_utils.requires_sox()
+def list_effects() -> Dict[str, str]:
+    """List the available sox effect names
+
+    Returns:
+        Dict[str, str]: Mapping from ``effect name`` to ``usage``
+    """
+    # The binding's result is copied into a plain ``dict`` before returning.
+    return dict(_paddleaudio.sox_utils_list_effects())
+
+
+@_mod_utils.requires_sox()
+def list_read_formats() -> List[str]:
+    """List the supported audio formats for read
+
+    Returns:
+        List[str]: List of supported audio formats
+    """
+    # The binding's result is returned as-is, without conversion.
+    return _paddleaudio.sox_utils_list_read_formats()
+
+
+@_mod_utils.requires_sox()
+def list_write_formats() -> List[str]:
+    """List the supported audio formats for write
+
+    Returns:
+        List[str]: List of supported audio formats
+    """
+    # The binding's result is returned as-is, without conversion.
+    return _paddleaudio.sox_utils_list_write_formats()
+
+
+@_mod_utils.requires_sox()
+def get_buffer_size() -> int:
+    """Get buffer size for sox effect chain
+
+    Returns:
+        int: size in bytes of buffers used for processing audio.
+    """
+    # Bound in pybind.cpp to ``paddleaudio::sox_utils::get_buffer_size``.
+    return _paddleaudio.sox_utils_get_buffer_size()