Merge pull request #2420 from SmileGoat/update_audio_api_in_apps

clean code in paddlespeech/audio
pull/2468/head
Hui Zhang 3 years ago committed by GitHub
commit 25f2a2f8e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +0,0 @@
add_subdirectory(third_party)
add_subdirectory(src)

@ -1,31 +0,0 @@
# PaddleAudio
## Reference
The `csrc` code is based on `torchaudio`.
```text
BSD 2-Clause License
Copyright (c) [year], [fullname]
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

@ -11,17 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import _extension
from . import compliance
from . import datasets
from . import features
from . import functional
from . import io
from . import metric
from . import sox_effects
from . import streamdata
from . import text
from . import transform
from .backends import load
from .backends import save

@ -1,164 +0,0 @@
import os
import warnings
from pathlib import Path
from ._internal import module_utils as _mod_utils # noqa: F401
import contextlib
import ctypes
import os
import sys
import types
# Query `hasattr` only once.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(
    sys, 'setdlopenflags')


@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the interpreter's dlopen flags.

    See https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html

    While the ``with`` body runs, ``sys.setdlopenflags`` is called with the
    previous flags OR-ed with ``ctypes.RTLD_GLOBAL``; it is used in this file
    while opening a shared library that carries custom operators.  The
    previous flags are restored on exit, including when the body raises.
    When the interpreter does not provide ``getdlopenflags`` /
    ``setdlopenflags`` the context manager does nothing.
    """
    if not _SET_GLOBAL_FLAGS:
        # Nothing to set and therefore nothing to restore.
        yield
        return

    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        # Restore even if loading the library failed, so a failed load does
        # not leave RTLD_GLOBAL switched on for every later import.
        sys.setdlopenflags(old_flags)
def resolve_library_path(path: str) -> str:
    """Return ``path`` in the canonical form computed by ``os.path.realpath``."""
    canonical = os.path.realpath(path)
    return canonical
class _Ops(types.ModuleType):
#__file__ = '_ops.py'
def __init__(self):
super(_Ops, self).__init__('paddlespeech.ops')
self.loaded_libraries = set()
def load_library(self, path):
"""
Loads a shared library from the given path into the current process.
This allows dynamically loading custom operators. For this,
you should compile your operator and
the static registration code into a shared library object, and then
call ``paddlespeech.ops.load_library('path/to/libcustom.so')`` to load the
shared object.
After the library is loaded, it is added to the
``paddlespeech.ops.loaded_libraries`` attribute, a set that may be inspected
for the paths of all libraries loaded using this function.
Args:
path (str): A path to a shared library to load.
"""
path = resolve_library_path(path)
with dl_open_guard():
# https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
# Import the shared library into the process, thus running its
# static (global) initialization code in order to register custom
# operators with the JIT.
ctypes.CDLL(path)
self.loaded_libraries.add(path)
_LIB_DIR = Path(__file__).parent / "lib"
def _get_lib_path(lib: str):
suffix = "pyd" if os.name == "nt" else "so"
path = _LIB_DIR / f"{lib}.{suffix}"
return path
def _load_lib(lib: str) -> bool:
    """Load the extension library ``lib`` if its file is present.

    Note:
        A missing file is deliberately not an error.  When the package is
        deployed in ``pex`` format the library file is not in the standard
        location; in that case the library is expected to be reachable
        through the dynamic loader's search path, so that importing the
        extension module finds and loads it.

    Returns:
        bool:
            True if the library file was found and loaded without failure.
            False if the library file was not found; a warning naming the
            expected path is emitted.

    Raises:
        Exception:
            If the file exists but loading it fails (the underlying
            ``ctypes.CDLL`` call raises, typically ``OSError`` for a missing
            dynamic dependency), the exception is propagated as-is.  That
            failure is not recoverable here: the user has to install the
            missing dependency.
    """
    path = _get_lib_path(lib)
    if path.exists():
        ops.load_library(path)
        return True
    warnings.warn("lib path is not exists:" + str(path))
    return False
# Set to True once `_init_ffmpeg` has completed successfully.
_FFMPEG_INITIALIZED = False


def _init_ffmpeg():
    """Initialize the FFmpeg integration of the C++ extension, at most once.

    Raises:
        RuntimeError: If the extension reports that it was built without
            FFmpeg support.
        ImportError: If loading the FFmpeg companion library raises
            ``OSError``.
    """
    global _FFMPEG_INITIALIZED
    if _FFMPEG_INITIALIZED:
        return

    # The name `paddlespeech` is never bound at module level in this file,
    # so the extension module is imported explicitly here instead of being
    # reached through `paddlespeech.audio....` attribute access.
    # NOTE(review): the module/library names below follow the spelling used
    # by `_init_extension` (`_paddleaudio`, `libpaddleaudio`); confirm the
    # names of the FFmpeg artefacts against the build configuration.
    from paddlespeech.audio import _paddleaudio

    if not _paddleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddleaudio."
        )
    try:
        _load_lib("libpaddleaudio_ffmpeg")
    except OSError as err:
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err

    import paddlespeech.audio._paddleaudio_ffmpeg  # noqa

    _paddleaudio.ffmpeg_init()
    # Cap the FFmpeg log level at 8.
    if _paddleaudio.ffmpeg_get_log_level() > 8:
        _paddleaudio.ffmpeg_set_log_level(8)
    _FFMPEG_INITIALIZED = True
def _init_extension():
    """Load the C++ extension and, on a best-effort basis, FFmpeg support.

    Emits a warning and returns early when the
    ``paddlespeech.audio._paddleaudio`` module cannot be located.
    """
    extension_found = _mod_utils.is_module_available(
        "paddlespeech.audio._paddleaudio")
    if not extension_found:
        warnings.warn("paddlespeech C++ extension is not available.")
        return

    _load_lib("libpaddleaudio")
    # Importing the module initializes the methods registered via PyBind11;
    # it has to happen after the base library has been loaded.
    from paddlespeech.audio import _paddleaudio  # noqa

    # This function runs while the package is being imported, so a failure
    # to initialize FFmpeg is ignored here.  If the FFmpeg integration is not
    # properly initialized, a detailed error is raised later, when client
    # code attempts to import the dedicated feature.
    with contextlib.suppress(Exception):
        _init_ffmpeg()
# Module-level `_Ops` instance; `_load_lib` loads shared libraries through it.
ops = _Ops()
# Initialize the extension as a side effect of importing this module.
_init_extension()

@ -1,148 +0,0 @@
import importlib.util
import warnings
from functools import wraps
from typing import Optional
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
def is_module_available(*modules: str) -> bool:
    r"""Return whether every module in ``modules`` can be located.

    The modules themselves are not imported.  This is generally safer than a
    try/except block around ``import X``: it avoids third-party libraries
    breaking assumptions of some of our tests, e.g. setting the
    multiprocessing start method when imported
    (see librosa/#747, torchvision/#544).

    For a dotted name such as ``"pkg.sub"``, ``importlib.util.find_spec``
    imports the parent package in order to search it.  If that import fails
    (for example because ``pkg`` is not installed) the module is reported as
    unavailable instead of letting the exception escape.

    Args:
        *modules (str): Module names, top-level or dotted.

    Returns:
        bool: True if all the modules are found (also when no name is given).
    """
    for name in modules:
        try:
            if importlib.util.find_spec(name) is None:
                return False
        except ImportError:
            # Raised (as ModuleNotFoundError) when a parent package of a
            # dotted name is missing or is not a package.
            return False
    return True
def requires_module(*modules: str):
    """Build a decorator that guards a function behind optional modules.

    Availability is checked once, when ``requires_module`` is called.  If
    every module is available the decorator returns the function untouched.
    Otherwise the decorated function is replaced by a stub that raises
    ``RuntimeError`` naming the missing module(s), which is a clearer message
    than a ``NameError: name 'module' is not defined`` at a random place.
    """
    unavailable = [name for name in modules if not is_module_available(name)]

    def passthrough(func):
        return func

    if not unavailable:
        # Nothing is missing, so there is no need to wrap anything.
        return passthrough

    if len(unavailable) == 1:
        req = f"module: {unavailable[0]}"
    else:
        req = f"modules: {unavailable}"

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires {req}")

        return wrapped

    return decorator
def deprecated(direction: str, version: Optional[str]=None):
    """Build a decorator that warns about deprecation on every call.

    Args:
        direction (str): Migration steps to be given to users.
        version (str or int): The version when the object will be removed.
            When omitted, the message says "future release".
    """
    release = "future" if version is None else version

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            # stacklevel=2 attributes the warning to the caller of `func`.
            warnings.warn(
                f"{func.__module__}.{func.__name__} has been deprecated "
                f"and will be removed from {release} release. "
                f"{direction}",
                stacklevel=2)
            return func(*args, **kwargs)

        return wrapped

    return decorator
def is_kaldi_available():
    """Return whether ``paddlespeech.audio._paddleaudio`` can be located."""
    extension = "paddlespeech.audio._paddleaudio"
    return is_module_available(extension)
def requires_kaldi():
    """Build a decorator that guards a function behind kaldi availability.

    When ``is_kaldi_available()`` is true the decorator returns the function
    unchanged; otherwise it replaces the function with a stub that raises
    ``RuntimeError`` when called.
    """

    def passthrough(func):
        return func

    def unavailable(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires kaldi")

        return wrapped

    return passthrough if is_kaldi_available() else unavailable
def _check_soundfile_importable():
    """Return True only if ``soundfile`` is installed and actually imports.

    A module that is found but fails to import produces a warning and is
    reported as not importable.
    """
    if is_module_available("soundfile"):
        try:
            import soundfile  # noqa: F401
        except Exception:
            warnings.warn(
                "Failed to import soundfile. 'soundfile' backend is not available.")
        else:
            return True
    return False


# Evaluated once, when this module is imported.
_is_soundfile_importable = _check_soundfile_importable()
def is_soundfile_available():
    """Return the cached result of the ``soundfile`` importability check."""
    available = _is_soundfile_importable
    return available
def requires_soundfile():
    """Build a decorator that guards a function behind ``soundfile``.

    When ``is_soundfile_available()`` is true the decorator returns the
    function unchanged; otherwise it replaces the function with a stub that
    raises ``RuntimeError`` when called.
    """

    def passthrough(func):
        return func

    def unavailable(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires soundfile")

        return wrapped

    return passthrough if is_soundfile_available() else unavailable
def is_sox_available():
    """Return whether ``paddlespeech.audio._paddleaudio`` can be located."""
    extension = "paddlespeech.audio._paddleaudio"
    return is_module_available(extension)
def requires_sox():
    """Build a decorator that guards a function behind sox availability.

    When ``is_sox_available()`` is true the decorator returns the function
    unchanged; otherwise it replaces the function with a stub that raises
    ``RuntimeError`` when called.
    """

    def passthrough(func):
        return func

    def unavailable(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            raise RuntimeError(
                f"{func.__module__}.{func.__name__} requires sox")

        return wrapped

    return passthrough if is_sox_available() else unavailable

@ -1,63 +0,0 @@
import contextlib
import ctypes
import os
import sys
import types
# Query `hasattr` only once.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(
    sys, 'setdlopenflags')


@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the interpreter's dlopen flags.

    See https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html

    While the ``with`` body runs, ``sys.setdlopenflags`` is called with the
    previous flags OR-ed with ``ctypes.RTLD_GLOBAL``; it is used in this file
    while opening a shared library that carries custom operators.  The
    previous flags are restored on exit, including when the body raises.
    When the interpreter does not provide ``getdlopenflags`` /
    ``setdlopenflags`` the context manager does nothing.
    """
    if not _SET_GLOBAL_FLAGS:
        # Nothing to set and therefore nothing to restore.
        yield
        return

    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        # Restore even if loading the library failed, so a failed load does
        # not leave RTLD_GLOBAL switched on for every later import.
        sys.setdlopenflags(old_flags)
def resolve_library_path(path: str) -> str:
    """Return ``path`` in the canonical form computed by ``os.path.realpath``."""
    canonical = os.path.realpath(path)
    return canonical
class _Ops(types.ModuleType):
__file__ = '_ops.py'
def __init__(self):
super(_Ops, self).__init__('paddlespeech.ops')
self.loaded_libraries = set()
def load_library(self, path):
"""
Loads a shared library from the given path into the current process.
This allows dynamically loading custom operators. For this,
you should compile your operator and
the static registration code into a shared library object, and then
call ``paddlespeech.ops.load_library('path/to/libcustom.so')`` to load the
shared object.
After the library is loaded, it is added to the
``paddlespeech.ops.loaded_libraries`` attribute, a set that may be inspected
for the paths of all libraries loaded using this function.
Args:
path (str): A path to a shared library to load.
"""
path = resolve_library_path(path)
with dl_open_guard():
# https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
# Import the shared library into the process, thus running its
# static (global) initialization code in order to register custom
# operators with the JIT.
ctypes.CDLL(path)
self.loaded_libraries.add(path)
# The ops "namespace": a single module-level `_Ops` instance created when
# this module is imported.
ops = _Ops()

@ -1,18 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
from . import utils
from .utils import get_audio_backend
from .utils import list_audio_backends
from .utils import set_audio_backend

@ -1,55 +0,0 @@
# code from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py
class AudioMetaData:
    """Plain holder for the metadata of an audio stream.

    Adapted from the return type of ``torchaudio.info``.

    :ivar int sample_rate: Sample rate
    :ivar int num_frames: The number of frames
    :ivar int num_channels: The number of channels
    :ivar int bits_per_sample: The number of bits per sample. This is 0 for
        lossy formats, or when it cannot be accurately inferred.
    :ivar str encoding: Audio encoding
        The values encoding can take are one of the following:

            * ``PCM_S``: Signed integer linear PCM
            * ``PCM_U``: Unsigned integer linear PCM
            * ``PCM_F``: Floating point linear PCM
            * ``FLAC``: Flac, Free Lossless Audio Codec
            * ``ULAW``: Mu-law
            * ``ALAW``: A-law
            * ``MP3`` : MP3, MPEG-1 Audio Layer III
            * ``VORBIS``: OGG Vorbis
            * ``AMR_WB``: Adaptive Multi-Rate Wideband
            * ``AMR_NB``: Adaptive Multi-Rate (narrowband)
            * ``OPUS``: Opus
            * ``HTK``: Single channel 16-bit PCM
            * ``UNKNOWN`` : None of above
    """

    def __init__(
            self,
            sample_rate: int,
            num_frames: int,
            num_channels: int,
            bits_per_sample: int,
            encoding: str, ):
        self.sample_rate = sample_rate
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.bits_per_sample = bits_per_sample
        self.encoding = encoding

    def __str__(self):
        # Fields are rendered in constructor order as `name=value` pairs.
        field_names = ("sample_rate", "num_frames", "num_channels",
                       "bits_per_sample", "encoding")
        rendered = ", ".join(
            f"{name}={getattr(self, name)}" for name in field_names)
        return f"AudioMetaData({rendered})"

@ -1,32 +0,0 @@
from pathlib import Path
from typing import Callable
from typing import Optional
from typing import Tuple
from typing import Union
from paddle import Tensor
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
def load(
        filepath: Union[str, Path],
        out: Optional[Tensor]=None,
        normalization: Union[bool, float, Callable]=True,
        channels_first: bool=True,
        num_frames: int=0,
        offset: int=0,
        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Stand-in ``load`` for when no audio I/O backend is available.

    All arguments are ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
def save(filepath: str,
         src: Tensor,
         sample_rate: int,
         precision: int=16,
         channels_first: bool=True) -> None:
    """Stand-in ``save`` for when no audio I/O backend is available.

    All arguments are ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
def info(filepath: str) -> None:
    """Stand-in ``info`` for when no audio I/O backend is available.

    The argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)

@ -1,662 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
from typing import Optional
from typing import Tuple
import numpy as np
import paddle
import resampy
import soundfile
from scipy.io import wavfile
from ..utils import depth_convert
from ..utils import ParameterError
from .common import AudioMetaData
# Public names of this module ('to_mono' used to be listed twice).
__all__ = [
    'resample',
    'to_mono',
    'normalize',
    'save',
    'soundfile_save',
    'load',
    'soundfile_load',
    'info',
]
# Normalization types named in the error raised by `normalize` (the constant
# keeps its original spelling because other code may refer to it).
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Values accepted by `to_mono` for `merge_type`.
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Values accepted by `resample` for `mode`.
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small constant used by `normalize` to keep its divisors away from zero.
EPS = 1e-8
def resample(y: np.ndarray,
             src_sr: int,
             target_sr: int,
             mode: str='kaiser_fast') -> np.ndarray:
    """Audio resampling.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        src_sr (int): Source sample rate.
        target_sr (int): Target sample rate.
        mode (str, optional): The resampling filter to use; one of
            ``RESAMPLE_MODES``. Defaults to 'kaiser_fast'.

    Returns:
        np.ndarray: `y` resampled to `target_sr`

    Raises:
        ParameterError: If ``y`` is not a ``np.ndarray`` or ``mode`` is not
            in ``RESAMPLE_MODES``.
    """
    # Validate first so that bad input fails without an unrelated warning.
    if not isinstance(y, np.ndarray):
        # The message must be an f-string for `{type(y)}` to be interpolated.
        raise ParameterError(
            f'Only support numpy np.ndarray, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    if mode == 'kaiser_best':
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, '
            'we recommend the mode kaiser_fast in large scale audio trainning')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    """Convert stereo audio to mono.

    A 1D input is returned unchanged.  For a 2D input the first axis is
    indexed as the channel: 'ch0' / 'ch1' pick one channel, 'random' picks
    index 0 or 1 at random, and 'average' averages the first two channels.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        merge_type (str, optional): Merge type to generate mono waveform;
            one of ``MERGE_TYPES``. Defaults to 'average'.

    Returns:
        np.ndarray: `y` with mono channel.

    Raises:
        ParameterError: For an unknown ``merge_type``, an array with more
            than two dimensions, or (when averaging) a dtype other than
            float32, int16 or int8.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    if merge_type in ('ch0', 'ch1'):
        return y[0] if merge_type == 'ch0' else y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # merge_type == 'average': how to average depends on the dtype.
    if y.dtype == 'float32':
        return (y[0] + y[1]) * 0.5

    # Integer input is widened before summing so the sum cannot overflow.
    widened_dtype = next(
        (wide for narrow, wide in (('int16', 'int32'), ('int8', 'int16'))
         if y.dtype == narrow), None)
    if widened_dtype is None:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')

    widened = y.astype(widened_dtype)
    mean = (widened[0] + widened[1]) // 2
    limits = np.iinfo(y.dtype)
    return np.clip(mean, limits.min, limits.max).astype(y.dtype)
def soundfile_load_(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
    """Read a waveform with the ``soundfile`` library (libsndfile).

    ``offset`` and ``duration`` are multiplied by the file's sample rate to
    obtain the frame to seek to and the number of frames to read.

    Args:
        file (os.PathLike): File of waveform.
        offset (Optional[float], optional): Offset to the start of waveform.
            Defaults to None (read from the beginning).
        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
        duration (Optional[int], optional): Duration of waveform to read.
            Defaults to None (read to the end).

    Returns:
        Tuple[np.ndarray, int]: The transposed array returned by
        ``SoundFile.read`` and the file's sample rate.
    """
    with soundfile.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            sf_desc.seek(int(offset * sr_native))
        # -1 asks soundfile to read every remaining frame.
        frames = -1 if duration is None else int(duration * sr_native)
        samples = sf_desc.read(frames=frames, dtype=dtype, always_2d=False)

    return samples.T, sf_desc.samplerate
def normalize(y: np.ndarray, norm_type: str='linear',
              mul_factor: float=1.0) -> np.ndarray:
    """Normalize an input audio with additional multiplier.

    'linear' divides by the maximum absolute value (plus ``EPS``);
    'gaussian' subtracts the mean and divides by the standard deviation
    (floored at ``EPS``).  The result is scaled by ``mul_factor``.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        mul_factor (float, optional): Scaling factor. Defaults to 1.0.

    Returns:
        np.ndarray: `y` after normalization.

    Raises:
        NotImplementedError: If ``norm_type`` is neither 'linear' nor
            'gaussian'.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor

    if norm_type == 'gaussian':
        mean = np.mean(y)
        std = max(np.std(y), EPS)
        return mul_factor * (y - mean) / std

    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
    """Save a waveform to a ``.wav`` file on disk.

    The file is written with ``scipy.io.wavfile``.  Input whose dtype is not
    int16 or int8 is first converted to int16 (a warning is emitted).

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        sr (int): Sample rate.
        file (os.PathLike): Path of audio file to save; a ``str`` or any
            ``os.PathLike`` object whose name ends in ``.wav``.

    Raises:
        ParameterError: If the file name does not end in ``.wav`` or ``sr``
            is not positive.
    """
    # os.fspath lets pathlib.Path (and other PathLike) arguments through;
    # calling `.endswith` on the argument directly only worked for str.
    if not os.fspath(file).endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    wavfile.write(file, sr, y_out)
def soundfile_load(
        file: os.PathLike,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
    """Load an audio file from disk and post-process the waveform.

    The waveform is read with ``soundfile_load_`` and then, in this order,
    optionally merged to mono, resampled, normalized and finally passed
    through ``depth_convert`` for the requested ``dtype``.

    Args:
        file (os.PathLike): Path of audio file to load.
        sr (Optional[int], optional): Sample rate of loaded waveform.
            Defaults to None (keep the file's sample rate).
        mono (bool, optional): Return waveform with mono channel. Defaults to True.
        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
        normal (bool, optional): Waveform normalization. Defaults to True.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.

    Raises:
        ParameterError: If no samples were read from the file.
    """
    waveform, native_sr = soundfile_load_(
        file, offset=offset, dtype=dtype, duration=duration)

    has_samples = ((waveform.ndim == 1 and len(waveform) > 0) or
                   (waveform.ndim == 2 and len(waveform[0]) > 0))
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        waveform = to_mono(waveform, merge_type)

    out_sr = native_sr
    if sr is not None and sr != native_sr:
        waveform = resample(waveform, native_sr, sr, mode=resample_mode)
        out_sr = sr

    if normal:
        waveform = normalize(waveform, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # Integer output still needs normalization before depth conversion.
        waveform = normalize(waveform, 'linear', 1.0)

    return depth_convert(waveform, dtype), out_sr
# The code below is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py
def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int):
    """Pick the soundfile subtype string for a WAV file.

    With no ``encoding`` the subtype follows ``bits_per_sample`` when given,
    and otherwise the tensor ``dtype``.  With an ``encoding`` the subtype
    follows the encoding, and unsupported bit depths are rejected.

    Raises:
        ValueError: For an unsupported dtype, encoding or bit depth.
    """
    if not encoding:
        if bits_per_sample:
            # 8-bit WAV is the unsigned variant.
            if bits_per_sample == 8:
                return "PCM_U8"
            return f"PCM_{bits_per_sample}"
        dtype_to_subtype = {
            paddle.uint8: "PCM_U8",
            paddle.int16: "PCM_16",
            paddle.int32: "PCM_32",
            paddle.float32: "FLOAT",
            paddle.float64: "DOUBLE",
        }
        subtype = dtype_to_subtype.get(dtype)
        if not subtype:
            raise ValueError(f"Unsupported dtype for wav: {dtype}")
        return subtype

    if encoding == "PCM_S":
        if not bits_per_sample:
            return "PCM_32"
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}"

    if encoding == "PCM_U":
        if bits_per_sample not in (None, 8):
            raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
        return "PCM_U8"

    if encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample == 64:
            return "DOUBLE"
        raise ValueError("wav only supports 32/64-bit float PCM encoding.")

    if encoding == "ULAW":
        if bits_per_sample not in (None, 8):
            raise ValueError("wav only supports 8-bit mu-law encoding.")
        return "ULAW"

    if encoding == "ALAW":
        if bits_per_sample not in (None, 8):
            raise ValueError("wav only supports 8-bit a-law encoding.")
        return "ALAW"

    raise ValueError(f"wav does not support {encoding}.")
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
if encoding in (None, "PCM_S"):
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
if encoding in ("PCM_U", "PCM_F"):
raise ValueError(f"sph does not support {encoding} encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("sph only supports 8-bit for mu-law encoding.")
if encoding == "ALAW":
return "ALAW"
raise ValueError(f"sph does not support {encoding}.")
def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int):
    """Pick the soundfile subtype string for the given container format.

    ``wav`` and ``sph`` are delegated to their dedicated helpers; ``flac``,
    ``ogg``/``vorbis`` and ``nis``/``nist`` are resolved here.

    Raises:
        ValueError: For an unsupported format, or an option the format does
            not accept.
    """
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)

    if format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)

    if format in ("nis", "nist"):
        return "PCM_16"

    if format in ("ogg", "vorbis"):
        if encoding or bits_per_sample:
            raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
        return "VORBIS"

    if format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample > 24:
            raise ValueError("flac does not support bits_per_sample > 24.")
        if bits_per_sample == 8:
            return "PCM_S8"
        return f"PCM_{bits_per_sample}"

    raise ValueError(f"Unsupported format: {format}")
def save(
    filepath: str,
    src: paddle.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` is intentionally annotated as ``str`` only, even though
        a ``pathlib.Path`` object is accepted as well. This keeps the
        signature consistent with the ``"sox_io"`` backend.

    Args:
        filepath (str or pathlib.Path): Path to audio file. An object with a
            ``write`` attribute is treated as a file object.
        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
        sample_rate (int): Sampling rate.
        channels_first (bool, optional): If ``True``, the given tensor is
            interpreted as `[channel, time]`, otherwise `[time, channel]`.
        compression (float or None, optional): Not used. It is here only for
            interface compatibility with the "sox_io" backend; passing a
            value emits a warning.
        format (str or None, optional): Override the audio format.
            When ``filepath`` is a path-like object, the audio format is
            inferred from the file extension. If the file extension is
            missing or different, you can specify the correct format with
            this argument.

            When ``filepath`` is a file-like object, this argument is
            required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported
            formats. This argument is effective only for supported formats,
            such as ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for
            the supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is invalid, or
            the format/encoding/bit-depth combination is unsupported.
        RuntimeError: If ``filepath`` is a file object and ``format`` is None.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )

    # File objects carry no extension, so the format has to be given.
    is_file_object = hasattr(filepath, "write")
    if is_file_object and format is None:
        raise RuntimeError("`format` is required when saving to file object.")
    if is_file_object:
        ext = format.lower()
    else:
        ext = str(filepath).rsplit(".", 1)[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is an extension used in TED-LIUM, but soundfile does not recognize
    # it as NIST format, so the format is spelled out manually here.
    if format is None and ext in ("nis", "nist", "sph"):
        format = "NIST"

    # The data is handed to soundfile transposed when it is channels-first.
    data = src.t() if channels_first else src
    soundfile.write(
        file=filepath,
        data=data,
        samplerate=sample_rate,
        subtype=subtype,
        format=format)
# Maps soundfile subtype names to the name of the matching sample dtype.
# NOTE(review): presumably consulted by `load` to choose the read dtype;
# confirm against its body.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}
def load(
    filepath: str,
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
    """Read audio from ``filepath`` with the "soundfile" backend.

    Note:
        Which formats can be decoded depends on the installed soundfile
        library. The function is tested on:

        * WAV (32-bit float, 32-bit signed int, 16-bit signed int,
          8-bit unsigned int)
        * FLAC
        * OGG/VORBIS
        * SPHERE

        With the defaults (``normalize=True``, ``channels_first=True``) the
        result is a ``float32`` Tensor of shape `[channel, time]` whose samples
        lie in ``[-1.0, 1.0]``.

        For integer WAV input (32-bit signed, 16-bit signed, 8-bit unsigned;
        24-bit signed is not supported), ``normalize=False`` returns an integer
        Tensor spanning the full range of the matching dtype: ``int32``,
        ``int16`` or ``uint8`` respectively.

        ``normalize`` has no effect on 32-bit floating-point WAV or on other
        formats such as ``flac`` and ``mp3``; those always come back as
        ``float32`` normalized to ``[-1.0, 1.0]``.

    Note:
        ``filepath`` is annotated as ``str`` on purpose, although a
        ``pathlib.Path`` works too. This keeps the signature consistent with
        the ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before reading starts.
        num_frames (int, optional):
            Upper bound on the number of frames read. ``-1`` reads everything
            remaining after ``frame_offset``. Fewer frames may be returned if
            the file does not contain enough of them.
        normalize (bool, optional):
            When ``True`` the result is always ``float32`` with samples
            normalized to ``[-1.0, 1.0]``. When ``False`` and the input is an
            integer WAV file, the result keeps the integer type. Ignored for
            every other format.
        channels_first (bool, optional):
            ``True`` returns `[channel, time]`; ``False`` returns
            `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (paddle.Tensor, int): The waveform and its sample rate. The Tensor is
        integer-typed only for integer WAV input with normalization off,
        otherwise ``float32``. Its layout follows ``channels_first``.

    Raises:
        ValueError: A WAV file is read with ``normalize=False`` and its subtype
            has no entry in ``_SUBTYPE2DTYPE``.
    """
    with soundfile.SoundFile(filepath, "r") as file_:
        wants_native_dtype = file_.format == "WAV" and not normalize
        if not wants_native_dtype:
            read_dtype = "float32"
        else:
            if file_.subtype not in _SUBTYPE2DTYPE:
                raise ValueError(f"Unsupported subtype: {file_.subtype}")
            read_dtype = _SUBTYPE2DTYPE[file_.subtype]

        # ``_prepare_read`` is a private SoundFile helper; it is called here
        # with the offset and requested frame count before the actual read.
        frame_count = file_._prepare_read(frame_offset, None, num_frames)
        samples = file_.read(frame_count, read_dtype, always_2d=True)
        sample_rate = file_.samplerate

    tensor = paddle.to_tensor(samples)
    if not channels_first:
        return tensor, sample_rate
    # ``always_2d`` data is transposed here to obtain `[channel, time]`.
    return paddle.transpose(tensor, perm=[1, 0]), sample_rate
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristical and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
"PCM_S8": 8, # Signed 8 bit data
"PCM_16": 16, # Signed 16 bit data
"PCM_24": 24, # Signed 24 bit data
"PCM_32": 32, # Signed 32 bit data
"PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only)
"FLOAT": 32, # 32 bit float data
"DOUBLE": 64, # 64 bit float data
"ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"IMA_ADPCM": 0, # IMA ADPCM.
"MS_ADPCM": 0, # Microsoft ADPCM.
"GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
"VOX_ADPCM": 0, # OKI / Dialogix ADPCM
"G721_32": 0, # 32kbs G721 ADPCM encoding.
"G723_24": 0, # 24kbs G723 ADPCM encoding.
"G723_40": 0, # 40kbs G723 ADPCM encoding.
"DWVW_12": 12, # 12 bit Delta Width Variable Word encoding.
"DWVW_16": 16, # 16 bit Delta Width Variable Word encoding.
"DWVW_24": 24, # 24 bit Delta Width Variable Word encoding.
"DWVW_N": 0, # N bit Delta Width Variable Word encoding.
"DPCM_8": 8, # 8 bit differential PCM (XI only)
"DPCM_16": 16, # 16 bit differential PCM (XI only)
"VORBIS": 0, # Xiph Vorbis encoding. (lossy)
"ALAC_16": 16, # Apple Lossless Audio Codec (16 bit).
"ALAC_20": 20, # Apple Lossless Audio Codec (20 bit).
"ALAC_24": 24, # Apple Lossless Audio Codec (24 bit).
"ALAC_32": 32, # Apple Lossless Audio Codec (32 bit).
}
def _get_bit_depth(subtype):
    """Return the bits per sample recorded for ``subtype``.

    A subtype absent from ``_SUBTYPE_TO_BITS_PER_SAMPLE`` triggers a warning
    and yields 0.
    """
    bit_depth = _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype)
    if bit_depth is not None:
        return bit_depth
    warnings.warn(
        f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
        "attribute will be set to 0. If you are seeing this warning, please "
        "report by opening an issue on github (after checking for existing/closed ones). "
        "You may otherwise ignore this warning."
    )
    return 0
_SUBTYPE_TO_ENCODING = {
"PCM_S8": "PCM_S",
"PCM_16": "PCM_S",
"PCM_24": "PCM_S",
"PCM_32": "PCM_S",
"PCM_U8": "PCM_U",
"FLOAT": "PCM_F",
"DOUBLE": "PCM_F",
"ULAW": "ULAW",
"ALAW": "ALAW",
"VORBIS": "VORBIS",
}
def _get_encoding(format: str, subtype: str):
if format == "FLAC":
return "FLAC"
return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
    """Return the signal metadata of an audio file.

    Note:
        ``filepath`` is annotated as ``str`` on purpose, although a
        ``pathlib.Path`` works too. This keeps the signature consistent with
        the ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioMetaData: sample rate, frame count and channel count as reported
        by ``soundfile.info``, plus bit depth and encoding derived from the
        reported subtype and format.
    """
    file_info = soundfile.info(filepath)
    subtype = file_info.subtype
    return AudioMetaData(
        file_info.samplerate,
        file_info.frames,
        file_info.channels,
        bits_per_sample=_get_bit_depth(subtype),
        encoding=_get_encoding(file_info.format, subtype),
    )

@ -1,101 +0,0 @@
from pathlib import Path
from typing import Callable
from typing import Optional, Tuple, Union
import paddle
from paddle import Tensor
from .common import AudioMetaData
import os
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio import _paddleaudio as paddleaudio
#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
    """Always raise ``RuntimeError`` naming ``filepath``; ``format`` is ignored."""
    message = "Failed to fetch metadata from {}".format(filepath)
    raise RuntimeError(message)
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioMetaData:
    """Always raise ``RuntimeError`` naming ``fileobj``; ``format`` is ignored."""
    message = "Failed to fetch metadata from {}".format(fileobj)
    raise RuntimeError(message)
# NOTE: the full annotations and the ``str.format`` call (rather than an
# f-string) follow a TorchScript-compatibility requirement; presumably carried
# over from the torchaudio code this backend is based on.
def _fail_load(
        filepath: str,
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
) -> Tuple[Tensor, int]:
    """Always raise ``RuntimeError`` naming ``filepath``.

    The remaining parameters mirror ``load`` and are ignored.
    """
    message = "Failed to load audio from {}".format(filepath)
    raise RuntimeError(message)
def _fail_load_fileobj(fileobj, *args, **kwargs):
raise RuntimeError(f"Failed to load audio from {fileobj}")
# Handlers used by ``info`` / ``load`` when the native sox binding returns
# ``None``. Each one raises ``RuntimeError``.
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
# ``load`` looks this handler up as ``_fallback_load_fileobj``. It used to be
# bound only under the misspelled name below, so a failed file-object load
# raised NameError instead of the intended RuntimeError.
_fallback_load_fileobj = _fail_load_fileobj
# Misspelled alias kept so existing references to the old name keep working.
_fallback_load_filebj = _fallback_load_fileobj
@_mod_utils.requires_sox()
def load(
        filepath: str,
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
) -> Tuple[Tensor, int]:
    """Load audio through the native sox bindings.

    Args:
        filepath: A path-like object, or any object exposing ``read`` (treated
            as a file object).
        frame_offset: Forwarded unchanged to the native loader.
        num_frames: Forwarded unchanged to the native loader.
        normalize: Forwarded unchanged to the native loader.
        channels_first: Forwarded unchanged to the native loader.
        format: Forwarded unchanged to the native loader.

    Returns:
        Tuple[Tensor, int]: The first element of the native result converted
        with ``paddle.to_tensor``, and its second element passed through as-is
        (annotated as the sample rate).

    Raises:
        RuntimeError: Raised by the fallback handler when the native loader
            returns ``None``.
    """
    read_options = (frame_offset, num_frames, normalize, channels_first, format)

    if hasattr(filepath, "read"):
        # File-like source: hand the object itself to the binding.
        result = paddleaudio.load_audio_fileobj(filepath, *read_options)
        if result is None:
            return _fallback_load_fileobj(filepath, *read_options)
    else:
        filepath = os.fspath(filepath)
        result = paddleaudio.sox_io_load_audio_file(filepath, *read_options)
        if result is None:
            return _fallback_load(filepath, *read_options)

    return (paddle.to_tensor(result[0]), result[1])
@_mod_utils.requires_sox()
def save(
        filepath: str,
        src: Tensor,
        sample_rate: int,
        channels_first: bool = True,
        compression: Optional[float] = None,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
):
    """Write ``src`` to ``filepath`` through the native sox bindings.

    ``src`` is converted with ``.numpy()`` before being handed over. An object
    exposing ``write`` is treated as a file object. Anything else is converted
    with ``os.fspath`` and treated as a path. All remaining arguments are
    forwarded unchanged to the native writer.
    """
    samples = src.numpy()
    write_options = (samples, sample_rate, channels_first, compression,
                     format, encoding, bits_per_sample)

    if not hasattr(filepath, "write"):
        paddleaudio.sox_io_save_audio_file(os.fspath(filepath), *write_options)
        return
    paddleaudio.save_audio_fileobj(filepath, *write_options)
@_mod_utils.requires_sox()
def info(filepath: str, format: Optional[str] = None, ) -> AudioMetaData:
    """Fetch audio metadata through the native sox bindings.

    An object exposing ``read`` is queried as a file object. Anything else is
    converted with ``os.fspath`` and queried as a path. The native result is
    unpacked positionally into ``AudioMetaData``.

    Raises:
        RuntimeError: Raised by the fallback handler when the native call
            returns ``None``.
    """
    if hasattr(filepath, "read"):
        fields = paddleaudio.get_info_fileobj(filepath, format)
        if fields is None:
            return _fallback_info_fileobj(filepath, format)
        return AudioMetaData(*fields)

    filepath = os.fspath(filepath)
    fields = paddleaudio.get_info_file(filepath, format)
    if fields is None:
        return _fallback_info(filepath, format)
    return AudioMetaData(*fields)

@ -1,93 +0,0 @@
"""Defines utilities for switching audio backends"""
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
import warnings
from typing import List
from typing import Optional
import paddlespeech.audio
from paddlespeech.audio._internal import module_utils as _mod_utils
from . import no_backend, soundfile_backend, sox_io_backend
__all__ = [
"list_audio_backends",
"get_audio_backend",
"set_audio_backend",
]
def list_audio_backends() -> List[str]:
    """List available backends

    Returns:
        List[str]: Names of the usable backends: ``"soundfile"`` when that
        module is importable, then ``"sox_io"`` when sox is available.
    """
    probes = (
        ("soundfile", _mod_utils.is_module_available("soundfile")),
        ("sox_io", _mod_utils.is_sox_available()),
    )
    return [name for name, usable in probes if usable]
def set_audio_backend(backend: Optional[str]):
    """Set the backend for I/O operation

    Rebinds ``paddlespeech.audio.save``, ``load`` and ``info`` to the
    functions of the selected backend module.

    Args:
        backend (str or None): Name of the backend.
            One of ``"sox_io"`` or ``"soundfile"`` based on availability
            of the system. If ``None`` is provided the current backend is unassigned.

    Raises:
        RuntimeError: ``backend`` is not ``None`` and is not currently available.
        NotImplementedError: ``backend`` is available but has no module mapped to it.
    """
    if backend is None:
        module = no_backend
    else:
        if backend not in list_audio_backends():
            raise RuntimeError(f'Backend "{backend}" is not one of '
                               f"available backends: {list_audio_backends()}.")
        known_modules = {
            "sox_io": sox_io_backend,
            "soundfile": soundfile_backend,
        }
        module = known_modules.get(backend)
        if module is None:
            raise NotImplementedError(f'Unexpected backend "{backend}"')

    for func in ("save", "load", "info"):
        setattr(paddlespeech.audio, func, getattr(module, func))
# def _init_audio_backend():
# backends = list_audio_backends()
# if "sox_io" in backends:
# set_audio_backend("sox_io")
# elif "soundfile" in backends:
# set_audio_backend("soundfile")
# else:
# warnings.warn("No audio backend is available.")
# set_audio_backend(None)
def _init_audio_backend():
    """Select the default backend: "soundfile" first, then "sox_io".

    When neither is available a warning is emitted and the backend is
    unassigned.
    """
    available = list_audio_backends()
    for candidate in ("soundfile", "sox_io"):
        if candidate in available:
            set_audio_backend(candidate)
            return
    warnings.warn("No audio backend is available.")
    set_audio_backend(None)
def get_audio_backend() -> Optional[str]:
    """Get the name of the current backend

    The backend is identified by comparing ``paddlespeech.audio.load`` with
    each backend module's ``load``.

    Returns:
        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.

    Raises:
        ValueError: ``paddlespeech.audio.load`` matches none of the known backends.
    """
    active_load = paddlespeech.audio.load
    if active_load == no_backend.load:
        return None
    if active_load == sox_io_backend.load:
        return "sox_io"
    if active_load == soundfile_backend.load:
        return "soundfile"
    raise ValueError("Unknown backend.")

@ -1,15 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import kaldi
from . import librosa

@ -1,638 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from torchaudio(https://github.com/pytorch/audio)
import math
from typing import Tuple
import paddle
from paddle import Tensor
from ..functional import create_dct
from ..functional.window import get_window
__all__ = [
'spectrogram',
'fbank',
'mfcc',
]
# window types
HANNING = 'hann'
HAMMING = 'hamming'
POVEY = 'povey'
RECTANGULAR = 'rect'
BLACKMAN = 'blackman'
def _get_epsilon(dtype):
    """Return the constant 1e-07 as a tensor of the given ``dtype``."""
    tiny = 1e-07
    return paddle.to_tensor(tiny, dtype=dtype)
def _next_power_of_2(x: int) -> int:
return 1 if x == 0 else 2**(x - 1).bit_length()
def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    """Cut a 1-D waveform into overlapping frames.

    Args:
        waveform (Tensor): 1-D input signal.
        window_size (int): Samples per frame.
        window_shift (int): Hop between the starts of consecutive frames.
        snip_edges (bool): When True only frames lying entirely inside the
            signal are produced. When False the signal is extended with
            mirrored copies of itself before framing.

    Returns:
        Tensor: Frames stacked as rows. An empty `(0, 0)` tensor is returned
        when ``snip_edges`` is True and the signal is shorter than one window.
    """
    assert waveform.dim() == 1
    num_samples = waveform.shape[0]

    if snip_edges:
        if num_samples < window_size:
            return paddle.empty((0, 0), dtype=waveform.dtype)
        num_frames = 1 + (num_samples - window_size) // window_shift
    else:
        mirrored = paddle.flip(waveform, [0])
        num_frames = (num_samples + (window_shift // 2)) // window_shift
        left_pad = window_size // 2 - window_shift // 2
        if left_pad > 0:
            # Prepend the mirrored head, append the whole mirrored signal.
            pieces = (mirrored[-left_pad:], waveform, mirrored)
        else:
            # Non-positive pad: the slice trims the front of the signal instead.
            pieces = (waveform[-left_pad:], mirrored)
        waveform = paddle.concat(pieces, axis=0)

    framed = paddle.signal.frame(waveform, window_size, window_shift)
    return framed[:, :num_frames].T
def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    """Build the analysis window of the requested type.

    Args:
        window_type (str): One of ``HANNING``, ``HAMMING``, ``POVEY``,
            ``RECTANGULAR`` or ``BLACKMAN``.
        window_size (int): Number of window samples.
        blackman_coeff (float): Coefficient used only by the Blackman window.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor: Window of length ``window_size``.

    Raises:
        Exception: ``window_type`` is none of the supported names.
    """
    if window_type == HANNING:
        return get_window('hann', window_size, fftbins=False, dtype=dtype)
    if window_type == HAMMING:
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)
    if window_type == POVEY:
        # Povey window: a 'hann' window raised to the power 0.85.
        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
        return hann.pow(0.85)
    if window_type == RECTANGULAR:
        return paddle.ones([window_size], dtype=dtype)
    if window_type == BLACKMAN:
        angular_step = 2 * math.pi / (window_size - 1)
        sample_index = paddle.arange(window_size, dtype=dtype)
        fundamental = 0.5 * paddle.cos(angular_step * sample_index)
        harmonic = (0.5 - blackman_coeff) * paddle.cos(
            2 * angular_step * sample_index)
        return (blackman_coeff - fundamental + harmonic).astype(dtype)
    raise Exception('Invalid window type ' + window_type)
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    """Return the per-row log energy of ``strided_input``.

    The energy is the sum of squares along axis 1, clamped from below by
    ``epsilon`` before the log. When ``energy_floor`` is non-zero the result
    is additionally clamped from below by ``log(energy_floor)``.
    """
    energy = strided_input.pow(2).sum(1)
    log_energy = paddle.maximum(energy, epsilon).log()
    if energy_floor == 0.0:
        return log_energy
    log_floor = paddle.to_tensor(
        math.log(energy_floor), dtype=strided_input.dtype)
    return paddle.maximum(log_energy, log_floor)
def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    """Select one channel and derive the framing sizes in samples.

    Args:
        waveform (Tensor): Input indexed as `[channel, sample]`.
        channel (int): Channel index; negative values are clamped to 0.
        sr (int): Sample rate.
        frame_shift (float): Frame hop in milliseconds.
        frame_length (float): Frame length in milliseconds.
        round_to_power_of_two (bool): Whether the padded window size is
            rounded up to a power of two.
        preemphasis_coefficient (float): Only validated here; must lie in [0, 1].

    Returns:
        Tuple[Tensor, int, int, int]: The selected 1-D channel, the window
        shift, the window size and the padded window size (all in samples).

    Raises:
        AssertionError: Any of the argument checks fails.
    """
    channel = max(channel, 0)
    num_channels = waveform.shape[0]
    assert channel < num_channels, (
        'Invalid channel {} for size {}'.format(channel, waveform.shape[0]))
    waveform = waveform[channel, :]  # size (n)

    # frame_shift and frame_length are given in milliseconds
    window_shift = int(sr * frame_shift * 0.001)
    window_size = int(sr * frame_length * 0.001)
    if round_to_power_of_two:
        padded_window_size = _next_power_of_2(window_size)
    else:
        padded_window_size = window_size

    assert 2 <= window_size <= len(waveform), (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         len(waveform)))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
        ' use `round_to_power_of_two` or change `frame_length`'
    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    """Frame the waveform and apply the per-frame pre-processing chain.

    Steps, in order: framing, optional dither, optional DC removal, optional
    pre-emphasis, windowing and zero-padding up to ``padded_window_size``.
    The log energy is taken right after DC removal when ``raw_energy`` is
    True, otherwise after the final padding.

    Returns:
        Tuple[Tensor, Tensor]: The processed frames `(m, padded_window_size)`
        and the per-frame log energy `(m)`.
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    # (m, window_size)
    frames = _get_strided(waveform, window_size, window_shift, snip_edges)

    if dither != 0.0:
        # One uniform draw per sample, clamped away from zero before the log.
        uniform = paddle.maximum(epsilon,
                                 paddle.rand(frames.shape, dtype=dtype))
        noise = paddle.sqrt(-2 * uniform.log()) * paddle.cos(
            2 * math.pi * uniform)
        frames = frames + noise * dither

    if remove_dc_offset:
        frame_means = paddle.mean(frames, axis=1).unsqueeze(1)  # (m, 1)
        frames = frames - frame_means

    if raw_energy:
        # Energy measured before pre-emphasis and windowing.
        signal_log_energy = _get_log_energy(frames, epsilon,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
        # Replicate-pad one sample on the left so column j holds sample j - 1.
        shifted = paddle.nn.functional.pad(
            frames.unsqueeze(0), (1, 0),
            data_format='NCL',
            mode='replicate').squeeze(0)  # (m, window_size + 1)
        frames = frames - preemphasis_coefficient * shifted[:, :-1]

    window = _feature_window_function(
        window_type, window_size, blackman_coeff,
        dtype).unsqueeze(0)  # (1, window_size)
    frames = frames * window  # (m, window_size)

    # (m, padded_window_size)
    if padded_window_size != window_size:
        extra = padded_window_size - window_size
        frames = paddle.nn.functional.pad(
            frames.unsqueeze(0), (0, extra),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)

    if not raw_energy:
        signal_log_energy = _get_log_energy(frames, epsilon,
                                            energy_floor)  # size (m)

    return frames, signal_log_energy
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    """Subtract each column's mean (taken over axis 0) when ``subtract_mean`` is set."""
    if not subtract_mean:
        return tensor
    column_means = paddle.mean(tensor, axis=0).unsqueeze(0)
    return tensor - column_means
def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str=POVEY) -> Tensor:
    """Compute a log power spectrogram from a waveform, intended to match Kaldi's output.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of the waveform to use. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the energy of the output spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): Whether energy is computed before pre-emphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round the window size up to a power of two by
            zero-padding the FFT input. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): When True, drop trailing samples that cannot fill a whole frame.
            Otherwise the waveform is extended by mirroring before framing. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean from the output.
            Defaults to False.
        window_type (str, optional): Window used before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)`, where the number
        of frames m depends on frame_length and frame_shift. Column 0 holds the signal log energy.
    """
    epsilon = _get_epsilon(waveform.dtype)

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    spectrum = paddle.fft.rfft(frames)

    # (m, padded_window_size // 2 + 1)
    log_power = paddle.maximum(spectrum.abs().pow(2.), epsilon).log()
    # The first bin is overwritten with the frame log energy.
    log_power[:, 0] = signal_log_energy

    return _subtract_column_mean(log_power, subtract_mean)
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    """Element-wise mel-to-Hz conversion: ``700 * (exp(mel / 1127) - 1)``."""
    ratio = (mel_freq / 1127.0).exp()
    return 700.0 * (ratio - 1.0)
def _mel_scale_scalar(freq: float) -> float:
return 1127.0 * math.log(1.0 + freq / 700.0)
def _mel_scale(freq: Tensor) -> Tensor:
    """Element-wise Hz-to-mel conversion: ``1127 * log(1 + freq / 700)``."""
    scaled = 1.0 + freq / 700.0
    return 1127.0 * scaled.log()
def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    """Apply a piecewise-linear VTLN warp to frequencies.

    Between the two inflection points frequencies are scaled by
    ``1 / vtln_warp_factor``. Below the lower and above the upper inflection
    point, separate linear segments are used whose formulas are anchored at
    ``low_freq`` and ``high_freq``. Frequencies outside
    ``[low_freq, high_freq]`` are returned unchanged.

    Args:
        vtln_low_cutoff (float): Lower inflection point; must exceed ``low_freq``.
        vtln_high_cutoff (float): Upper inflection point; must be below ``high_freq``.
        low_freq (float): Lower edge of the warped range.
        high_freq (float): Upper edge of the warped range.
        vtln_warp_factor (float): Warp factor.
        freq (Tensor): Frequencies to warp.

    Returns:
        Tensor: Warped frequencies, same shape as ``freq``.
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'

    lower_knee = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    upper_knee = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    scale = 1.0 / vtln_warp_factor
    warped_lower_knee = scale * lower_knee
    warped_upper_knee = scale * upper_knee
    assert lower_knee > low_freq and upper_knee < high_freq

    # Slopes of the outer segments.
    scale_left = (warped_lower_knee - low_freq) / (lower_knee - low_freq)
    scale_right = (high_freq - warped_upper_knee) / (high_freq - upper_knee)

    below_range = paddle.less_than(freq, paddle.to_tensor(low_freq))
    above_range = paddle.greater_than(freq, paddle.to_tensor(high_freq))
    out_of_range = below_range | above_range
    below_lower_knee = paddle.less_than(freq, paddle.to_tensor(lower_knee))
    below_upper_knee = paddle.less_than(freq, paddle.to_tensor(upper_knee))
    at_or_above_upper_knee = paddle.greater_equal(
        freq, paddle.to_tensor(upper_knee))

    warped = paddle.empty_like(freq)
    # Assignment order matters: later masks overwrite earlier ones.
    warped[at_or_above_upper_knee] = high_freq + scale_right * (
        freq[at_or_above_upper_knee] - high_freq)
    warped[below_upper_knee] = scale * freq[below_upper_knee]
    warped[below_lower_knee] = low_freq + scale_left * (
        freq[below_lower_knee] - low_freq)
    warped[out_of_range] = freq[out_of_range]
    return warped
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    """VTLN-warp mel frequencies: convert to Hz, warp, convert back to mel."""
    freq_hz = _inverse_mel_scale(mel_freq)
    warped_hz = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
                                high_freq, vtln_warp_factor, freq_hz)
    return _mel_scale(warped_hz)
def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    """Build triangular mel filter weights.

    Args:
        num_bins (int): Number of mel filters; at least 3.
        window_length_padded (int): FFT length; must be even.
        sample_freq (float): Sample rate in Hz.
        low_freq (float): Lower edge of the filter bank in Hz.
        high_freq (float): Upper edge in Hz. Values <= 0 are taken relative to
            the Nyquist frequency.
        vtln_low (float): Lower VTLN inflection point in Hz.
        vtln_high (float): Upper VTLN inflection point in Hz. Negative values
            are taken relative to the Nyquist frequency.
        vtln_warp_factor (float): VTLN warp factor; 1.0 disables warping.

    Returns:
        Tuple[Tensor, Tensor]: The filter weights `(num_bins, num_fft_bins)`
        and the filter centre frequencies in Hz.

    Raises:
        AssertionError: Any of the argument checks fails.
    """
    # The check used to be ``num_bins > 3``, which rejected exactly 3 bins
    # although the message (and Kaldi) allow them.
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # NOTE(review): true division yields a float, so ``paddle.arange`` below
    # presumably produces a floating-point tensor; confirm before using ``//``.
    num_fft_bins = window_length_padded / 2

    nyquist = 0.5 * sample_freq
    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))

    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)

    # Filters are equally spaced on the mel axis; neighbours overlap by half.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))

    # Renamed from ``bin`` to avoid shadowing the builtin.
    bin_index = paddle.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin_index * mel_freq_delta  # (num_bins, 1)
    center_mel = mel_low_freq + (bin_index + 1.0) * mel_freq_delta  # (num_bins, 1)
    right_mel = mel_low_freq + (bin_index + 2.0) * mel_freq_delta  # (num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                       high_freq, vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)

    center_freqs = _inverse_mel_scale(center_mel)
    # (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)

    # (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # Without warping, each triangle is min(up, down) clamped at zero.
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        # With warping, each side is filled only where mel lies strictly inside it.
        bins = paddle.zeros_like(up_slope)
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
            mel, center_mel)
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str=POVEY) -> Tensor:
    """Compute mel filter-bank features from a waveform, intended to match Kaldi's output.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of the waveform to use. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the energy of the output spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): When True, the energy column is placed last instead of first.
            Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): Whether energy is computed before pre-emphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round the window size up to a power of two by
            zero-padding the FFT input. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): When True, drop trailing samples that cannot fill a whole frame.
            Otherwise the waveform is extended by mirroring before framing. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean from the output.
            Defaults to False.
        use_energy (bool, optional): Add a column holding the signal log energy to the output.
            Defaults to False.
        use_log_fbank (bool, optional): Return log filter-bank energies when True. Defaults to True.
        use_power (bool, optional): Use the power spectrum instead of the magnitude. Defaults to True.
        vtln_high (float, optional): High inflection point of the piecewise-linear VTLN warp.
            Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise-linear VTLN warp.
            Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Window used before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A filter-bank tensor with shape `(m, n_mels)`, or `(m, n_mels + 1)` when
        ``use_energy`` is True.
    """
    dtype = waveform.dtype

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, signal_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames).abs()
    if use_power:
        spectrum = spectrum.pow(2.)

    # (n_mels, padded_window_size // 2)
    mel_filters, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                    high_freq, vtln_low, vtln_high, vtln_warp)
    mel_filters = mel_filters.astype(dtype)

    # Append one zero column so the filters span every rfft bin:
    # (n_mels, padded_window_size // 2 + 1)
    mel_filters = paddle.nn.functional.pad(
        mel_filters.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)

    # (m, n_mels)
    features = paddle.mm(spectrum, mel_filters.T)
    if use_log_fbank:
        # Clamp from below before the log.
        features = paddle.maximum(features, _get_epsilon(dtype)).log()

    if use_energy:
        energy_column = signal_log_energy.unsqueeze(1)
        if htk_compat:
            columns = (features, energy_column)
        else:
            columns = (energy_column, features)
        # (m, n_mels + 1)
        features = paddle.concat(columns, axis=1)

    return _subtract_column_mean(features, subtract_mean)
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    """Return the first ``n_mfcc`` columns of an `(n_mels, n_mels)` 'ortho' DCT matrix.

    Column 0 is overwritten with ``sqrt(1 / n_mels)`` before truncation.
    """
    full_dct = create_dct(n_mels, n_mels, 'ortho')
    full_dct[:, 0] = math.sqrt(1 / float(n_mels))
    return full_dct[:, :n_mfcc]  # (n_mels, n_mfcc)
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    """Return the ``n_mfcc`` liftering weights ``1 + 0.5 * L * sin(pi * i / L)``.

    ``L`` is ``cepstral_lifter`` and ``i`` runs over ``0 .. n_mfcc - 1``.
    """
    index = paddle.arange(n_mfcc)
    sinusoid = paddle.sin(math.pi * index / cepstral_lifter)
    return 1.0 + 0.5 * cepstral_lifter * sinusoid
def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str=POVEY) -> Tensor:
    """Compute mel frequency cepstral coefficients (MFCC) from a waveform.

    The computation is: log power mel filter banks (via `fbank`) -> DCT ->
    optional cepstral liftering -> optional energy substitution -> optional
    HTK column ordering -> optional mean subtraction. The result is intended
    to match Kaldi's output.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
        cepstral_lifter (float, optional): Scaling of output mfccs; 0 disables liftering. Defaults to 22.0.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): Put energy in the last column when True. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra in MFCC; must not exceed `n_mels`. Defaults to 13.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): Drop samples at the end of the waveform that can't fill a single frame when
            True. Otherwise perform reflect padding at the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the column mean from the final features. Defaults to False.
        use_energy (bool, optional): Replace the first cepstral coefficient with the signal log energy. Defaults to False.
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.

    Returns:
        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)

    dtype = waveform.dtype

    # Mean subtraction is deferred to the very end, so `fbank` always gets
    # subtract_mean=False; log power filter banks are always requested.
    fbank_options = dict(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        n_mels=n_mels,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sr=sr,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type)
    # (m, n_mels + use_energy)
    feature = fbank(**fbank_options)

    if use_energy:
        # The energy column is last under HTK ordering, first otherwise.
        energy_column = n_mels if htk_compat else 0
        # (m)
        signal_log_energy = feature[:, energy_column]
        # Drop the energy column so only the n_mels filter bank columns remain.
        mel_offset = 0 if htk_compat else 1
        feature = feature[:, mel_offset:(n_mels + mel_offset)]

    # (n_mels, n_mfcc)
    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)
    # (m, n_mfcc)
    feature = feature.matmul(dct_matrix)

    if cepstral_lifter != 0.0:
        # (1, n_mfcc)
        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter_coeffs.astype(dtype=dtype)

    if use_energy:
        # The first coefficient is replaced by the signal log energy.
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # Move the first column to the end.
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            # Without energy substitution the moved column is scaled by sqrt(2).
            energy *= math.sqrt(2)
        feature = paddle.concat((feature, energy), axis=1)

    return _subtract_column_mean(feature, subtract_mean)

@ -1,788 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import warnings
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import scipy
from numpy.lib.stride_tricks import as_strided
from scipy import signal
from ..utils import depth_convert
from ..utils import ParameterError
# Names exported by `from <this module> import *`, grouped by purpose.
__all__ = [
    # dsp: spectral transforms, mel-scale helpers and mu-law codec
    'stft',
    'mfcc',
    'hz_to_mel',
    'mel_to_hz',
    'mel_frequencies',
    'power_to_db',
    'compute_fbank_matrix',
    'melspectrogram',
    'spectrogram',
    'mu_encode',
    'mu_decode',
    # augmentation: random masking, cropping and bit-depth distortion
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
]
def _pad_center(data: np.ndarray, size: int, axis: int=-1,
**kwargs) -> np.ndarray:
"""Pad an array to a target length along a target axis.
This differs from `np.pad` by centering the data prior to padding,
analogous to `str.center`
"""
kwargs.setdefault("mode", "constant")
n = data.shape[axis]
lpad = int((size - n) // 2)
lengths = [(0, 0)] * data.ndim
lengths[axis] = (lpad, int(size - n - lpad))
if lpad < 0:
raise ParameterError(("Target size ({size:d}) must be "
"at least input size ({n:d})"))
return np.pad(data, lengths, **kwargs)
def _split_frames(x: np.ndarray,
frame_length: int,
hop_length: int,
axis: int=-1) -> np.ndarray:
"""Slice a data array into (overlapping) frames.
This function is aligned with librosa.frame
"""
if not isinstance(x, np.ndarray):
raise ParameterError(
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
if x.shape[axis] < frame_length:
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
f" for frame_length={frame_length:d}")
if hop_length < 1:
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.asfortranarray(x)
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.ascontiguousarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * new_stride]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * new_stride] + list(strides)
else:
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
return as_strided(x, shape=shape, strides=strides)
def _check_audio(y, mono=True) -> bool:
"""Determine whether a variable contains valid audio data.
The audio y must be a np.ndarray, ether 1-channel or two channel
"""
if not isinstance(y, np.ndarray):
raise ParameterError("Audio data must be of type numpy.ndarray")
if y.ndim > 2:
raise ParameterError(
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
if mono and y.ndim == 2:
raise ParameterError(
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
if not np.issubdtype(y.dtype, np.floating):
raise ParameterError("Audio data must be floating-point")
if not np.isfinite(y).all():
raise ParameterError("Audio buffer is not finite everywhere")
return True
def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert frequencies in Hz to the mel scale.

    With `htk=True` the HTK formula `2595 * log10(1 + f / 700)` is used.
    Otherwise the scale is linear below 1000 Hz and logarithmic above.

    Args:
        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequency in mels.
    """
    hz = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    # Start with the linear mapping everywhere, then patch the log region.
    mels = (hz - f_min) / f_sp

    if hz.ndim:
        # Array input: overwrite only the entries at or above 1000 Hz.
        in_log_region = hz >= min_log_hz
        mels[in_log_region] = min_log_mel + \
            np.log(hz[in_log_region] / min_log_hz) / logstep
        return mels

    if hz >= min_log_hz:
        # Scalar input in the log region.
        return min_log_mel + np.log(hz / min_log_hz) / logstep

    return mels
def mel_to_hz(mels: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert mel bin numbers to frequencies in Hz.

    This is the inverse of `hz_to_mel`: with `htk=True` the HTK formula
    `700 * (10 ** (m / 2595) - 1)` is used, otherwise the scale is linear
    below the mel value of 1000 Hz and exponential above it.

    Args:
        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequencies in Hz.
    """
    mel_array = np.asanyarray(mels)

    if htk:
        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)

    # Fill in the linear scale
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel_array

    # And now the nonlinear scale
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    if mel_array.ndim:
        # If we have vector data, vectorize
        log_t = mel_array >= min_log_mel
        freqs[log_t] = min_log_hz * \
            np.exp(logstep * (mel_array[log_t] - min_log_mel))
    elif mel_array >= min_log_mel:
        # If we have scalar data, check directly
        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))

    return freqs
def mel_frequencies(n_mels: int=128,
                    fmin: float=0.0,
                    fmax: float=11025.0,
                    htk: bool=False) -> np.ndarray:
    """Compute frequencies that are evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Convert both limits to mels, space the points uniformly there, and map
    # them back to Hz.
    mel_lo = hz_to_mel(fmin, htk=htk)
    mel_hi = hz_to_mel(fmax, htk=htk)
    mel_points = np.linspace(mel_lo, mel_hi, n_mels)
    return mel_to_hz(mel_points, htk=htk)
def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
    """Compute the center frequencies of the non-negative FFT bins.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.

    Returns:
        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`,
        evenly spaced from 0 to `sr / 2` inclusive.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, nyquist, n_bins, endpoint=True)
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=128,
                         fmin: float=0.0,
                         fmax: Optional[float]=None,
                         htk: bool=False,
                         norm: str="slaney",
                         dtype: type=np.float32) -> np.ndarray:
    """Compute the triangular mel filter bank matrix.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (Optional[float], optional): Maximum frequency in Hz; `sr / 2` when None. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (str, optional): Type of normalization; only "slaney" is accepted. Defaults to "slaney".
        dtype (type, optional): Data type. Defaults to np.float32.

    Returns:
        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.

    Raises:
        ParameterError: If `norm` is not "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    if fmax is None:
        fmax = float(sr) / 2

    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # Band edges: n_mels filters need n_mels + 2 points on the mel scale.
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    # Rising and falling slopes for every filter at once; row i uses band
    # edges i (rising) and i + 2 (falling).
    lower = -ramps[:n_mels] / fdiff[:n_mels, np.newaxis]
    upper = ramps[2:n_mels + 2] / fdiff[1:n_mels + 1, np.newaxis]
    # Intersect the two slopes with each other and with zero.
    weights[...] = np.maximum(0, np.minimum(lower, upper))

    # Slaney-style mel is scaled to be approx constant energy per channel.
    # `norm` is guaranteed to be "slaney" by the guard at the top.
    enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
    weights *= enorm[:, np.newaxis]

    # A filter is "empty" when all of its weights are zero; filters whose
    # lower edge is exactly 0 Hz are exempt from the check.
    has_response = (mel_f[:-2] == 0) | (weights.max(axis=1) > 0)
    if not np.all(has_response):
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return weights
def stft(x: np.ndarray,
         n_fft: int=2048,
         hop_length: Optional[int]=None,
         win_length: Optional[int]=None,
         window: str="hann",
         center: bool=True,
         dtype: type=np.complex64,
         pad_mode: str="reflect") -> np.ndarray:
    r"""Short-time Fourier transform (STFT).

    Args:
        x (np.ndarray): Input waveform in one dimension.
        n_fft (int, optional): FFT size. Defaults to 2048.
        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows;
            `win_length // 4` when None. Defaults to None.
        win_length (Optional[int], optional): The size of window; `n_fft` when None. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that :math:`t \times hop\_length` is at the center of
            the `t`-th frame. Defaults to True.
        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".

    Returns:
        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.

    Raises:
        ParameterError: If `x` is not valid audio, or if `center` is False and
            `n_fft` exceeds the signal length.
    """
    _check_audio(x)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = signal.get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = _pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        if n_fft > x.shape[-1]:
            # With centering the signal is padded, so this is only a warning.
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
            )
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
    elif n_fft > x.shape[-1]:
        # Without padding not even one full frame fits.
        raise ParameterError(
            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")

    fft = np.fft  # use numpy fft as default

    # Constrain STFT block sizes to 256 KB
    MAX_MEM_BLOCK = 2**8 * 2**10

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Transform the frames block by block to bound temporary memory.
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.rfft(
            fft_window * x_frames[:, bl_s:bl_t], axis=0)

    return stft_matrix
def power_to_db(spect: np.ndarray,
                ref: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=80.0) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    Computes `10 * log10(x / ref)` in a numerically stable way by flooring
    both `x` and `ref` at `amin` before taking logarithms.

    Args:
        spect (np.ndarray): STFT power spectrogram of an input waveform.
        ref (float, optional): The reference value, or a callable applied to the magnitude to compute it.
            If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the
            db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak; None disables
            the threshold. Defaults to 80.0.

    Returns:
        np.ndarray: Power spectrogram in db scale.

    Raises:
        ParameterError: If `amin` is not strictly positive or `top_db` is negative.
    """
    spect = np.asarray(spect)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    if np.issubdtype(spect.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.")
        magnitude = np.abs(spect)
    else:
        magnitude = spect

    # A callable reference is evaluated on the magnitude; otherwise the
    # absolute value of the number is used.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise ParameterError("top_db must be non-negative")

    # Clip everything more than `top_db` below the peak.
    return np.maximum(log_spec, log_spec.max() - top_db)
def mfcc(x: np.ndarray,
         sr: int=16000,
         spect: Optional[np.ndarray]=None,
         n_mfcc: int=20,
         dct_type: int=2,
         norm: str="ortho",
         lifter: int=0,
         **kwargs) -> np.ndarray:
    """Mel-frequency cepstral coefficients (MFCCs)

    Args:
        x (np.ndarray): Input waveform in one dimension. Ignored when `spect` is given.
        sr (int, optional): Sample rate. Defaults to 16000.
        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram; computed from `x` with
            `melspectrogram` when None. Defaults to None.
        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
        norm (str, optional): Type of normalization. Defaults to "ortho".
        lifter (int, optional): Cepstral filtering. When positive, coefficient `n` (1-based) is scaled by
            `1 + lifter / 2 * sin(pi * n / lifter)`; 0 disables liftering. Defaults to 0.
        **kwargs: Forwarded to `melspectrogram` when `spect` is None.

    Returns:
        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.

    Raises:
        ParameterError: If `lifter` is negative.
    """
    # Imported here so the submodule is guaranteed to be loaded; the file's
    # bare `import scipy` does not promise that `scipy.fftpack` is available.
    from scipy.fftpack import dct

    # Validate before the expensive spectrogram / DCT work. Written as
    # `not >= 0` so that NaN is rejected too.
    if not lifter >= 0:
        raise ParameterError(
            f"MFCC lifter={lifter} must be a non-negative number")

    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    # DCT along the mel axis, keeping the leading coefficients.
    M = dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]

    if lifter == 0:
        return M

    # Sinusoidal liftering as in librosa: 1 + (L / 2) * sin(pi * n / L).
    # The constant and the L / 2 gain were previously missing.
    # Sized from M so the weights always match the rows actually returned.
    n = np.arange(1, 1 + M.shape[0], dtype=M.dtype)
    factor = 1 + (lifter / 2) * np.sin(np.pi * n / lifter)
    return M * factor[:, np.newaxis]
def melspectrogram(x: np.ndarray,
                   sr: int=16000,
                   window_size: int=512,
                   hop_length: int=320,
                   n_mels: int=64,
                   fmin: float=50.0,
                   fmax: Optional[float]=None,
                   window: str='hann',
                   center: bool=True,
                   pad_mode: str='reflect',
                   power: float=2.0,
                   to_db: bool=True,
                   ref: float=1.0,
                   amin: float=1e-10,
                   top_db: Optional[float]=None) -> np.ndarray:
    r"""Compute mel-spectrogram.

    Args:
        x (np.ndarray): Input waveform in one dimension.
        sr (int, optional): Sample rate. Defaults to 16000.
        window_size (int, optional): Size of FFT and window length. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        fmax (Optional[float], optional): Maximum frequency in Hz; `sr // 2` when None. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that :math:`t \times hop\_length` is at the center of
            the `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
        to_db (bool, optional): Enable db scale. Defaults to True.
        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be
            pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.

    Returns:
        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.

    Raises:
        ParameterError: If `x` is not valid mono audio, is empty, or the
            frequency limits do not satisfy `0 <= fmin < fmax`.
    """
    _check_audio(x, mono=True)
    if len(x) <= 0:
        raise ParameterError('The input waveform is empty')

    if fmax is None:
        fmax = sr // 2
    if fmin < 0 or fmin >= fmax:
        # fmin == 0 is accepted, so the bound in the message is inclusive.
        raise ParameterError('fmin and fmax must satisfy 0<=fmin<fmax')

    s = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)

    # Magnitude raised to `power`, then projected onto the mel filter bank.
    spect_power = np.abs(s)**power
    fb_matrix = compute_fbank_matrix(
        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
    mel_spect = np.matmul(fb_matrix, spect_power)

    if to_db:
        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
    else:
        return mel_spect
def spectrogram(x: np.ndarray,
                sr: int=16000,
                window_size: int=512,
                hop_length: int=320,
                window: str='hann',
                center: bool=True,
                pad_mode: str='reflect',
                power: float=2.0) -> np.ndarray:
    r"""Compute the STFT magnitude spectrogram raised to `power`.

    Args:
        x (np.ndarray): Input waveform in one dimension.
        sr (int, optional): Sample rate. Not used by the computation. Defaults to 16000.
        window_size (int, optional): Size of FFT and window length. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that :math:`t \times hop\_length` is at the center of
            the `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.

    Returns:
        np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
    """
    # The FFT size and the window length are both `window_size`.
    stft_options = dict(
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)
    magnitude = np.abs(stft(x, **stft_options))
    return magnitude**power
def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law encoding. Encode waveform based on mu-law companding.

    The companded value is `sign(x) * log1p(mu * |x|) / log1p(mu)`. For `x`
    in `[-1, 1]` it lies in `[-1, 1]`; when `quantized` is True it is mapped
    to integer-valued floats in `[0, mu]`.

    Args:
        x (np.ndarray): The input waveform to encode.
        mu (int, optional): The encoding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer
            values. Defaults to True.

    Returns:
        np.ndarray: The mu-law encoded waveform.
    """
    # `mu` is honored as passed; it used to be overwritten with 255 here.
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    if quantized:
        # Shift [-1, 1] to [0, mu] and round to the nearest level.
        y = np.floor((y + 1) / 2 * mu + 0.5)
    return y
def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law decoding. Compute the mu-law expansion of an input code.

    Note that the computation uses `mu - 1` internally (both for undoing the
    quantization and for the expansion), so quantized input is expected to be
    in `[0, mu - 1]`; non-quantized input is expected to be in `[-1, 1]`.

    Args:
        y (np.ndarray): The encoded waveform.
        mu (int, optional): The encoding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, the input is assumed to be quantized integer codes.
            Defaults to True.

    Returns:
        np.ndarray: The mu-law decoded waveform.

    Raises:
        ParameterError: If `mu < 1`.
    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    scale = mu - 1
    # Undo the quantization: map codes back to [-1, 1].
    companded = y * 2 / scale - 1 if quantized else y
    # Inverse of the logarithmic companding.
    return np.sign(companded) / scale * ((1 + scale)**np.abs(companded) - 1)
def _randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
"""
return int(np.random.randint(0, high=high))
def depth_augment(y: np.ndarray,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
    """Audio depth augmentation.

    Simulates the distortion brought by quantization: the waveform is
    converted to a randomly chosen depth and then converted back to its
    original dtype.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        choices (List, optional): A list of data type to depth conversion. Defaults to ['int8', 'int16'].
        probs (List[float], optional): Probabilities to depth conversion. Defaults to [0.5, 0.5].

    Returns:
        np.ndarray: The augmented waveform.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))

    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype
    # Round trip: down to the sampled depth, then back to the source dtype.
    degraded = depth_convert(y, target_depth)
    return depth_convert(degraded, original_depth)
def adaptive_spect_augment(spect: np.ndarray,
                           tempo_axis: int=0,
                           level: float=0.1) -> np.ndarray:
    """Do adaptive spectrogram augmentation.

    Both the number of masks (`int(10 * level)` per axis) and the mask widths
    (`int(size * level * 0.5)`) are governed by `level`, ranging from 0 to 1,
    where 0 means no augmentation. The input is modified in place and also
    returned.

    Args:
        spect (np.ndarray): Input spectrogram.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
        level (float, optional): The level factor of masking. Defaults to 0.1.

    Returns:
        np.ndarray: The augmented spectrogram.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    # Any tempo_axis other than 0 is treated as "time is the second axis".
    time_axis = 0 if tempo_axis == 0 else 1
    freq_axis = 1 - time_axis
    nt = spect.shape[time_axis]
    nf = spect.shape[freq_axis]

    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)

    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    def _zero_band(axis, length, width):
        # Zero a band of `width` entries at a random offset along `axis`.
        start = _randint(length - width)
        band = [slice(None), slice(None)]
        band[axis] = slice(start, start + width)
        spect[tuple(band)] = 0

    # Time masks are drawn first, then frequency masks.
    for _ in range(num_time_mask):
        _zero_band(time_axis, nt, time_mask_width)
    for _ in range(num_freq_mask):
        _zero_band(freq_axis, nf, freq_mask_width)

    return spect
def spect_augment(spect: np.ndarray,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> np.ndarray:
    """Do spectrogram augmentation in both time and freq axis.

    The number of masks and the mask widths are drawn with `_randint`, i.e.
    from `[0, max)` for each of the four `max_*` arguments. The input is
    modified in place and also returned.

    Args:
        spect (np.ndarray): Input spectrogram.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
        max_time_mask (int, optional): Exclusive upper bound on the number of time masks. Defaults to 3.
        max_freq_mask (int, optional): Exclusive upper bound on the number of frequency masks. Defaults to 3.
        max_time_mask_width (int, optional): Exclusive upper bound on the time mask width. Defaults to 30.
        max_freq_mask_width (int, optional): Exclusive upper bound on the frequency mask width. Defaults to 20.

    Returns:
        np.ndarray: The augmented spectrogram.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    # Any tempo_axis other than 0 is treated as "time is the second axis".
    time_axis = 0 if tempo_axis == 0 else 1
    freq_axis = 1 - time_axis
    nt = spect.shape[time_axis]
    nf = spect.shape[freq_axis]

    # The order of these four draws is part of the random sequence.
    num_time_mask = _randint(max_time_mask)
    num_freq_mask = _randint(max_freq_mask)

    time_mask_width = _randint(max_time_mask_width)
    freq_mask_width = _randint(max_freq_mask_width)

    def _zero_band(axis, length, width):
        # Zero a band of `width` entries at a random offset along `axis`.
        start = _randint(length - width)
        band = [slice(None), slice(None)]
        band[axis] = slice(start, start + width)
        spect[tuple(band)] = 0

    # Time masks are drawn first, then frequency masks.
    for _ in range(num_time_mask):
        _zero_band(time_axis, nt, time_mask_width)
    for _ in range(num_freq_mask):
        _zero_band(freq_axis, nf, freq_mask_width)

    return spect
def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
    """Random cropping on an input waveform.

    Args:
        y (np.ndarray): Input waveform array in 1D.
        crop_len (int): Length of waveform to crop; must not exceed `len(y)`.

    Returns:
        np.ndarray: The cropped waveform, a contiguous slice of `y` of length `crop_len`.

    Raises:
        ParameterError: If `y` is not one-dimensional.
    """
    if y.ndim != 1:
        # This check used to be a bare string expression and did nothing.
        raise ParameterError('only accept 1d tensor or numpy array')

    n = len(y)
    # Valid start offsets are 0 .. n - crop_len inclusive; randint's upper
    # bound is exclusive, hence the + 1. This also allows crop_len == n.
    idx = int(np.random.randint(0, high=n - crop_len + 1))
    return y[idx:idx + crop_len]
def random_crop2d(s: np.ndarray, crop_len: int,
                  tempo_axis: int=0) -> np.ndarray:
    """Random cropping on a spectrogram.

    Args:
        s (np.ndarray): Input spectrogram in 2D.
        crop_len (int): Length of spectrogram to crop; must not exceed `s.shape[tempo_axis]`.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.

    Returns:
        np.ndarray: The cropped spectrogram; only `tempo_axis` is shortened.

    Raises:
        ParameterError: If `tempo_axis` is not smaller than `s.ndim`.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # Valid start offsets are 0 .. n - crop_len inclusive; randint's upper
    # bound is exclusive, hence the + 1. This also allows crop_len == n.
    idx = int(np.random.randint(0, high=n - crop_len + 1))

    # Take everything along the other axes and the crop window along tempo_axis.
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]

@ -1,20 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .esc50 import ESC50
from .gtzan import GTZAN
from .hey_snips import HeySnips
from .rirs_noises import OpenRIRNoise
from .tess import TESS
from .urban_sound import UrbanSound8K
from .voxceleb import VoxCeleb

@ -1,100 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from ..compliance.kaldi import fbank as kaldi_fbank
from ..compliance.kaldi import mfcc as kaldi_mfcc
from ..compliance.librosa import melspectrogram
from ..compliance.librosa import mfcc
# Maps each supported `feat_type` name to the function that extracts that
# feature. 'raw' maps to None, in which case the dataset returns the waveform
# itself instead of calling an extractor.
feat_funcs = {
    'raw': None,
    'melspectrogram': melspectrogram,
    'mfcc': mfcc,
    'kaldi_fbank': kaldi_fbank,
    'kaldi_mfcc': kaldi_mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Each item is loaded from an audio file on access and optionally turned
    into a feature selected by `feat_type` (see `feat_funcs`).
    """

    def __init__(self,
                 files: List[str],
                 labels: List[int],
                 feat_type: str='raw',
                 sample_rate: int=None,
                 **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract from an audio file.
            sample_rate (:obj:`int`, `optional`, defaults to `None`):
                Sample rate requested when loading audio; when `None` the loader is called
                without an explicit rate.
            **kwargs: Extra keyword arguments forwarded to the feature function.

        Raises:
            RuntimeError: If `feat_type` is not a key of `feat_funcs`.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in feat_funcs:
            raise RuntimeError(
                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
            )

        self.files = files
        self.labels = labels

        self.feat_type = feat_type
        self.sample_rate = sample_rate
        self.feat_config = kwargs  # Pass keyword arguments to customize feature config

    def _get_data(self, input_file: str):
        """Hook for subclasses; not implemented in the base class."""
        raise NotImplementedError

    def _convert_to_record(self, idx):
        """Load the audio file at `idx` and build a `{'feat', 'label'}` record."""
        # `paddlespeech` is not imported at module level in this file, so the
        # name is bound here; without this the load calls raise NameError.
        import paddlespeech.audio

        file, label = self.files[idx], self.labels[idx]

        if self.sample_rate is None:
            waveform, sample_rate = paddlespeech.audio.load(file)
        else:
            waveform, sample_rate = paddlespeech.audio.load(
                file, sr=self.sample_rate)

        feat_func = feat_funcs[self.feat_type]

        record = {}
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
            # Use the rate reported by the loader when none was configured;
            # passing sr=None would override the extractor's own default.
            sr = self.sample_rate if self.sample_rate is not None else sample_rate
            record['feat'] = feat_func(
                waveform=waveform, sr=sr, **self.feat_config)
        else:
            # 'raw' has no extractor: the waveform itself is the feature.
            record['feat'] = feat_func(
                waveform, sample_rate,
                **self.feat_config) if feat_func else waveform
        record['label'] = label
        return record

    def __getitem__(self, idx):
        """Return the item at `idx`.

        Kaldi feature types yield `(key, feat, label)`; all other types yield
        `(feat transposed as ndarray, label as int64 ndarray)`.
        """
        record = self._convert_to_record(idx)
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            # NOTE(review): `self.keys` is not assigned anywhere in this
            # class; presumably subclasses provide it - confirm.
            return self.keys[idx], record['feat'], record['label']
        else:
            return np.array(record['feat']).transpose(), np.array(
                record['label'], dtype=np.int64)

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.files)

@ -1,152 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
class ESC50(AudioClassificationDataset):
    """
    ESC-50: 2000 labeled environmental recordings, each 5 seconds long,
    spread evenly over 50 semantic classes (40 clips per class). It is a
    common benchmark for environmental sound classification.

    Reference:
        ESC: Dataset for Environmental Sound Classification
        http://dx.doi.org/10.1145/2733373.2806390
    """

    archieves = [
        {
            'url':
            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
            'md5': '7771e4b9d86d0945acce719c7a59305a',
        },
    ]
    label_list = [
        # Animals
        'Dog',
        'Rooster',
        'Pig',
        'Cow',
        'Frog',
        'Cat',
        'Hen',
        'Insects (flying)',
        'Sheep',
        'Crow',
        # Natural soundscapes & water sounds
        'Rain',
        'Sea waves',
        'Crackling fire',
        'Crickets',
        'Chirping birds',
        'Water drops',
        'Wind',
        'Pouring water',
        'Toilet flush',
        'Thunderstorm',
        # Human, non-speech sounds
        'Crying baby',
        'Sneezing',
        'Clapping',
        'Breathing',
        'Coughing',
        'Footsteps',
        'Laughing',
        'Brushing teeth',
        'Snoring',
        'Drinking, sipping',
        # Interior/domestic sounds
        'Door knock',
        'Mouse click',
        'Keyboard typing',
        'Door, wood creaks',
        'Can opening',
        'Washing machine',
        'Vacuum cleaner',
        'Clock alarm',
        'Clock tick',
        'Glass breaking',
        # Exterior/urban noises
        'Helicopter',
        'Chainsaw',
        'Siren',
        'Car horn',
        'Engine',
        'Train',
        'Church bells',
        'Airplane',
        'Fireworks',
        'Hand saw',
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Dataset mode: 'train' selects every fold except `split`,
                any other value selects only fold `split`.
            split (:obj:`int`, `optional`, defaults to 1):
                The fold held out as the dev set.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the metadata csv (header row skipped) into META_INFO tuples."""
        meta_file = os.path.join(DATA_HOME, self.meta)
        with open(meta_file, 'r') as rf:
            rows = rf.readlines()[1:]
        return [self.meta_info(*row.strip().split(',')) for row in rows]

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Download the archive if it is missing, then pick files by fold."""
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
            download_and_decompress(self.archieves, DATA_HOME)

        want_train = mode == 'train'
        files = []
        labels = []
        for sample in self._get_meta_info():
            in_dev_fold = int(sample.fold) == split
            # Train keeps everything outside the dev fold; any other mode
            # keeps exactly the dev fold.
            if want_train == in_dev_fold:
                continue
            files.append(os.path.join(audio_dir, sample.filename))
            labels.append(int(sample.target))

        return files, labels

@ -1,115 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
class GTZAN(AudioClassificationDataset):
    """
    GTZAN: 1000 audio tracks of 30 seconds each, covering 10 genres with
    100 tracks per genre. It is widely used to evaluate music genre
    recognition (MGR) systems.

    Reference:
        Musical genre classification of audio signals
        https://ieeexplore.ieee.org/document/1021072/
    """

    archieves = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = [
        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock'
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Dataset mode: 'train' selects every fold except `split`,
                any other value selects only fold `split`.
            seed (:obj:`int`, `optional`, defaults to 0):
                Random seed used to shuffle the samples before folding.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Number of folds: one serves as dev set, the rest as train set.
            split (:obj:`int`, `optional`, defaults to 1):
                The fold held out as the dev set.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the tab-separated metadata file into META_INFO tuples."""
        meta_file = os.path.join(DATA_HOME, self.meta)
        with open(meta_file, 'r') as rf:
            rows = rf.readlines()
        return [self.meta_info(*row.strip().split('\t')) for row in rows]

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Download if needed, shuffle with `seed`, then pick files by fold."""
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()
        # Train and dev instances must be built with the same seed so that
        # they see the same shuffled order and therefore the same folds.
        random.seed(seed)
        random.shuffle(meta_info)

        want_train = mode == 'train'
        fold_size = len(meta_info) // n_folds
        files = []
        labels = []
        for idx, sample in enumerate(meta_info):
            filename = os.path.basename(sample.file_path)
            target = self.label_list.index(sample.label)
            fold = idx // fold_size + 1  # folds are numbered from 1

            if want_train == (fold == split):
                continue
            files.append(os.path.join(audio_dir, sample.label, filename))
            labels.append(target)

        return files, labels

@ -1,74 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import os
from typing import List
from typing import Tuple
from .dataset import AudioClassificationDataset
__all__ = ['HeySnips']
class HeySnips(AudioClassificationDataset):
    """
    Dataset read from a `<mode>.json` manifest inside `data_dir`.

    Each manifest item contributes a key, a label (0 when `is_hotword` is 1,
    otherwise -1), a duration and an audio path joined onto `data_dir`.
    Items whose duration is not greater than 0 are dropped.
    """

    meta_info = collections.namedtuple('META_INFO',
                                       ('key', 'label', 'duration', 'wav'))

    def __init__(self,
                 data_dir: os.PathLike,
                 mode: str='train',
                 feat_type: str='kaldi_fbank',
                 sample_rate: int=16000,
                 **kwargs):
        """
        Args:
            data_dir: Directory holding the `<mode>.json` manifest and audio.
            mode: Name of the manifest to read (file `<mode>.json`).
            feat_type: Feature type forwarded to the base class.
            sample_rate: Sample rate forwarded to the base class.
            **kwargs: Extra feature options forwarded to the base class.
        """
        self.data_dir = data_dir
        files, labels = self._get_data(mode)
        super(HeySnips, self).__init__(
            files=files,
            labels=labels,
            feat_type=feat_type,
            sample_rate=sample_rate,
            **kwargs)

    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
        """Load the manifest for `mode` and turn usable items into META_INFO."""
        manifest = os.path.join(self.data_dir, '{}.json'.format(mode))
        with open(manifest, 'r') as f:
            items = json.load(f)

        ret = []
        for item in items:
            if item['duration'] > 0:
                ret.append(
                    self.meta_info(item['id'], 0
                                   if item['is_hotword'] == 1 else -1,
                                   item['duration'],
                                   os.path.join(self.data_dir,
                                                item['audio_file_path'])))
        return ret

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """
        Build the file and label lists; also fills `self.keys` and
        `self.durations`, index-aligned with the returned lists.
        """
        files = []
        labels = []
        self.keys = []
        self.durations = []
        for sample in self._get_meta_info(mode):
            files.append(sample.wav)
            labels.append(int(sample.label))
            self.keys.append(sample.key)
            self.durations.append(float(sample.duration))

        return files, labels

@ -1,200 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import os
import random
from typing import List
from paddle.io import Dataset
from tqdm import tqdm
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import feat_funcs
__all__ = ['OpenRIRNoise']
class OpenRIRNoise(Dataset):
    """
    Dataset over the `rir` or `noise` subset of the OpenSLR `rirs_noises`
    archive.

    On first use the archive is downloaded, the wav lists shipped with it are
    turned into `rir.csv` / `noise.csv` (long files are cut into
    `chunk_duration`-second pieces written next to the data), and the csv of
    the requested subset is loaded and shuffled.
    """

    archieves = [
        {
            'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
            'md5': 'e6f48e257286e05de56413b4779d8ffb',
        },
    ]

    sample_rate = 16000
    meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
    base_path = os.path.join(DATA_HOME, 'open_rir_noise')
    wav_path = os.path.join(base_path, 'RIRS_NOISES')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['rir', 'noise']

    def __init__(self,
                 subset: str='rir',
                 feat_type: str='raw',
                 target_dir=None,
                 random_chunk: bool=True,
                 chunk_duration: float=3.0,
                 seed: int=0,
                 **kwargs):
        """
        Args:
            subset: One of `subsets` ('rir' or 'noise').
            feat_type: Key of `feat_funcs` selecting the feature extractor.
            target_dir: If set, csv files live in
                `<target_dir>/open_rir_noise/csv` instead of the default.
            random_chunk: Stored on the instance; not read by this class.
            chunk_duration: Chunk length in seconds used when splitting audio.
            seed: Accepted for API compatibility; currently not applied.
            **kwargs: Extra keyword arguments forwarded to the feature function.
        """
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration

        # NOTE: this rebinds the attribute on the class, so it also affects
        # every other OpenRIRNoise instance.
        OpenRIRNoise.csv_path = os.path.join(
            target_dir, "open_rir_noise",
            "csv") if target_dir else self.csv_path
        self._data = self._get_data()
        super(OpenRIRNoise, self).__init__()

        # Set up a seed to reproduce training or predicting result.
        # random.seed(seed)

    def _get_data(self):
        """Download and prepare the data if needed, then load the subset csv."""
        # Download audio files.
        print(f"rirs noises base path: {self.base_path}")
        if not os.path.isdir(self.base_path):
            download_and_decompress(
                self.archieves, self.base_path, decompress=True)
        else:
            print(
                f"{self.base_path} already exists, we will not download and decompress again"
            )

        # Data preparation: the csv files are generated only once, when the
        # csv directory does not exist yet.
        print(f"prepare the csv to {self.csv_path}")
        if not os.path.isdir(self.csv_path):
            os.makedirs(self.csv_path)
            self.prepare_data()

        data = []
        with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
            for line in rf.readlines()[1:]:  # skip the header row
                audio_id, duration, wav = line.strip().split(',')
                data.append(self.meta_info(audio_id, float(duration), wav))

        random.shuffle(data)
        return data

    def _convert_to_record(self, idx: int):
        """Return a dict with the META_INFO fields of sample `idx` plus 'feat'."""
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = paddlespeech.audio.load(record['wav'])

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # A falsy extractor (the 'raw' entry) means: return the waveform.
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """
        Name the consecutive full-length chunks of one audio file.

        Returns ids of the form `<audio_id>_<start>_<end>`. `seg_dur` and
        `audio_duration` share one unit (callers in this class pass seconds);
        a trailing remainder shorter than `seg_dur` is dropped.
        """
        num_chunks = int(audio_duration / seg_dur)
        chunk_lst = [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]
        return chunk_lst

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """
        Build the csv rows `[id, duration, wav]` for one audio file.

        When `split_chunks` is true and the file is longer than
        `chunk_duration`, each chunk is saved as its own wav file under
        `base_path` and gets its own row; otherwise one row for the whole
        file is returned.
        """
        waveform, sr = paddlespeech.audio.load(wav_file)
        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
        audio_duration = waveform.shape[0] / sr

        ret = []
        if split_chunks and audio_duration > self.chunk_duration:  # Split into pieces of self.chunk_duration seconds.
            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
                                                audio_duration)

            for idx, chunk in enumerate(uniq_chunks_list):
                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
                start_sample = int(float(s) * sr)
                end_sample = int(float(e) * sr)
                new_wav_file = os.path.join(self.base_path,
                                            audio_id + f'_chunk_{idx+1:02}.wav')
                paddlespeech.audio.save(waveform[start_sample:end_sample], sr,
                                        new_wav_file)
                # id, duration, new_wav
                ret.append([chunk, self.chunk_duration, new_wav_file])
        else:  # Keep whole audio.
            ret.append([audio_id, audio_duration, wav_file])
        return ret

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool=True):
        """Write one csv (header `id,duration,wav`) describing `wav_files`."""
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav"]

        infos = list(
            tqdm(
                map(self._get_audio_info, wav_files, [split_chunks] * len(
                    wav_files)),
                total=len(wav_files)))

        csv_lines = []
        for info in infos:
            csv_lines.extend(info)

        # newline='' is what the csv module requires for files it writes;
        # without it, platforms that translate '\n' emit '\r\r\n' row endings.
        with open(output_file, mode="w", newline='') as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            for line in csv_lines:
                csv_writer.writerow(line)

    def prepare_data(self):
        """Read the wav lists shipped in the archive and generate both csv files."""
        rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
                                "rir_list")
        rir_files = []
        with open(rir_list, 'r') as f:
            for line in f.readlines():
                # The wav path is the last space-separated field of each line.
                rir_file = line.strip().split(' ')[-1]
                rir_files.append(os.path.join(self.base_path, rir_file))

        noise_list = os.path.join(self.wav_path, "pointsource_noises",
                                  "noise_list")
        noise_files = []
        with open(noise_list, 'r') as f:
            for line in f.readlines():
                noise_file = line.strip().split(' ')[-1]
                noise_files.append(os.path.join(self.base_path, noise_file))

        self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
        self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))

    def __getitem__(self, idx):
        """Return the record dict of sample `idx`."""
        return self._convert_to_record(idx)

    def __len__(self):
        """Number of rows loaded from the subset csv."""
        return len(self._data)

@ -1,126 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import AudioClassificationDataset
__all__ = ['TESS']
class TESS(AudioClassificationDataset):
    """
    TESS: 200 target words spoken in the carrier phrase "Say the word _____"
    by two actresses (aged 26 and 64 years), each recorded portraying seven
    emotions (anger, disgust, fear, happiness, pleasant surprise, sadness
    and neutral), for 2800 stimuli in total.

    Reference:
        Toronto emotional speech set (TESS)
        https://doi.org/10.5683/SP2/E8H2MF
    """

    archieves = [
        {
            'url':
            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
            'md5':
            '1465311b24d1de704c4c63e4ccc470c7',
        },
    ]
    label_list = [
        'angry',
        'disgust',
        'fear',
        'happy',
        'neutral',
        'ps',  # pleasant surprise
        'sad',
    ]
    meta_info = collections.namedtuple('META_INFO',
                                       ('speaker', 'word', 'emotion'))
    # Relative to DATA_HOME.
    audio_path = 'TESS_Toronto_emotional_speech_set'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Dataset mode: 'train' selects every fold except `split`,
                any other value selects only fold `split`.
            seed (:obj:`int`, `optional`, defaults to 0):
                Random seed used to shuffle the samples before folding.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Number of folds: one serves as dev set, the rest as train set.
            split (:obj:`int`, `optional`, defaults to 1):
                The fold held out as the dev set.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(TESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """
        Derive META_INFO from each file name: the last four characters (the
        extension) are cut off and the rest is split on '_'.
        """
        return [
            self.meta_info(*os.path.basename(path)[:-4].split('_'))
            for path in files
        ]

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Download if needed, shuffle with `seed`, then pick files by fold."""
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        if not os.path.isdir(audio_dir):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = [
            os.path.join(root, name)
            for root, _, names in os.walk(audio_dir) for name in names
            if name.endswith('.wav')
        ]

        # Train and dev instances must be built with the same seed so that
        # they see the same shuffled order and therefore the same folds.
        random.seed(seed)
        random.shuffle(wav_files)
        meta_info = self._get_meta_info(wav_files)

        want_train = mode == 'train'
        fold_size = len(meta_info) // n_folds
        files = []
        labels = []
        for idx, (wav, sample) in enumerate(zip(wav_files, meta_info)):
            target = self.label_list.index(sample.emotion)
            fold = idx // fold_size + 1  # folds are numbered from 1

            if want_train == (fold == split):
                continue
            files.append(wav)
            labels.append(target)

        return files, labels

@ -1,104 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
class UrbanSound8K(AudioClassificationDataset):
    """
    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
    drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
    classes are drawn from the urban sound taxonomy.

    Reference:
        A Dataset and Taxonomy for Urban Sound Research
        https://dl.acm.org/doi/10.1145/2647868.2655045
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
            'md5': '9aa69802bbf37fb986f71ec1483a196e',
        },
    ]
    label_list = [
        "air_conditioner", "car_horn", "children_playing", "dog_bark",
        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
        "street_music"
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
    meta_info = collections.namedtuple(
        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
                      'class_id', 'label'))
    audio_path = os.path.join('UrbanSound8K', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        # The docstring used to sit after these statements, where it was a
        # no-op string expression and never became `__init__.__doc__`.
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self):
        """Parse the metadata csv (header row skipped) into META_INFO tuples."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines()[1:]:
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """
        Download the archive if it is missing, then select files by fold.

        'train' keeps every sample whose fold differs from `split`; any other
        mode keeps only the samples of fold `split`.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()

        files = []
        labels = []
        for sample in meta_info:
            filename, _, _, _, _, fold, target, _ = sample
            in_dev_fold = int(fold) == split
            # Exactly one of "train mode" / "sample is in the dev fold" must
            # hold for the sample to be selected.
            if (mode == 'train') != in_dev_fold:
                # Audio files are stored in per-fold sub-directories.
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

        return files, labels

@ -1,355 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import glob
import os
import random
from multiprocessing import cpu_count
from typing import List
from paddle.io import Dataset
from pathos.multiprocessing import Pool
from tqdm import tqdm
from ..utils import DATA_HOME
from ..utils import decompress
from ..utils.download import download_and_decompress
from .dataset import feat_funcs
__all__ = ['VoxCeleb']
class VoxCeleb(Dataset):
# Base URL of the VoxCeleb1 audio archives listed below.
source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
# The dev audio is published as a multi-part zip: `_get_data` downloads the
# parts without decompressing them and concatenates them into
# vox1_dev_wav.zip before extraction.
archieves_audio_dev = [
{
'url': source_url + 'vox1_dev_wav_partaa',
'md5': 'e395d020928bc15670b570a21695ed96',
},
{
'url': source_url + 'vox1_dev_wav_partab',
'md5': 'bbfaaccefab65d82b21903e81a8a8020',
},
{
'url': source_url + 'vox1_dev_wav_partac',
'md5': '017d579a2a96a077f40042ec33e51512',
},
{
'url': source_url + 'vox1_dev_wav_partad',
'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
},
]
# Test audio is a single zip that is downloaded and decompressed directly.
archieves_audio_test = [
{
'url': source_url + 'vox1_test_wav.zip',
'md5': '185fdc63c3c739954633d50379a3d102',
},
]
# Trial list for verification; downloaded into `meta_path` as-is.
archieves_meta = [
{
'url':
'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
'md5':
'b73110731c9223c1461fe49cb48dddfc',
},
]
num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
sample_rate = 16000
# One row of the generated csv files, in column order.
meta_info = collections.namedtuple(
'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
# Default locations under DATA_HOME. `__init__` rebinds `csv_path`,
# `meta_path` and `veri_test_file` on the class when `target_dir` is given.
base_path = os.path.join(DATA_HOME, 'vox1')
wav_path = os.path.join(base_path, 'wav')
meta_path = os.path.join(base_path, 'meta')
veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
csv_path = os.path.join(base_path, 'csv')
# Valid values for the `subset` constructor argument.
subsets = ['train', 'dev', 'enroll', 'test']
def __init__(
        self,
        subset: str='train',
        feat_type: str='raw',
        random_chunk: bool=True,
        chunk_duration: float=3.0,  # seconds
        split_ratio: float=0.9,  # train split ratio
        seed: int=0,
        target_dir: str=None,
        vox2_base_path=None,
        **kwargs):
    """Prepare the VoxCeleb data and load the audio info of one subset.

    Args:
        subset (str, optional): One of train, dev, enroll or test. Defaults to 'train'.
        feat_type (str, optional): Feature type, a key of `feat_funcs` such as raw, melspectrogram or mfcc. Defaults to 'raw'.
        random_chunk (bool, optional): Pick a random chunk of each audio on access. Defaults to True.
        chunk_duration (float, optional): Chunk length in seconds. Defaults to 3.0.
        split_ratio (float, optional): Fraction of the files assigned to train. Defaults to 0.9.
        seed (int, optional): Accepted but currently not applied. Defaults to 0.
        target_dir (str, optional): If set, csv and meta files are kept under `<target_dir>/voxceleb`. Defaults to None.
        vox2_base_path (optional): vox2 directory; its audio must already be converted from m4a to wav. Defaults to None.
        **kwargs: Extra keyword arguments forwarded to the feature function.
    """
    assert subset in self.subsets, \
        'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

    self.subset = subset
    self.spk_id2label = {}
    self.feat_type = feat_type
    self.feat_config = kwargs
    self.random_chunk = random_chunk
    self.chunk_duration = chunk_duration
    self.split_ratio = split_ratio
    self.target_dir = target_dir or VoxCeleb.base_path
    self.vox2_base_path = vox2_base_path

    # A target dir relocates the csv/meta info from the default base path.
    # The attributes are rebound on the class, so all instances are affected.
    if target_dir:
        voxceleb_dir = os.path.join(target_dir, "voxceleb")
        VoxCeleb.csv_path = os.path.join(voxceleb_dir, 'csv')
        VoxCeleb.meta_path = os.path.join(voxceleb_dir, 'meta')
    VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
                                           'veri_test2.txt')

    self._data = self._get_data()
    super(VoxCeleb, self).__init__()

    # Set up a seed to reproduce training or predicting result.
    # random.seed(seed)
def _get_data(self):
    """Download/prepare whatever is missing, then load the subset csv.

    Returns the list of META_INFO rows of `self.subset` and fills
    `self.spk_id2label` from `spk_id2label.txt`.
    """
    # All of vox1/dev/wav and vox1/test/wav is expected to end up in
    # vox1/wav, so the existence of that directory decides whether the
    # audio has to be downloaded.
    print(f"wav base path: {self.wav_path}")
    if not os.path.isdir(self.wav_path):
        print("start to download the voxceleb1 dataset")
        # Dev audio comes as zip parts: fetch only, they are joined below.
        download_and_decompress(
            self.archieves_audio_dev, self.base_path, decompress=False)
        # Test audio is a plain zip: fetch and unzip.
        download_and_decompress(
            self.archieves_audio_test, self.base_path, decompress=True)

        # Join the downloaded parts into a single zip file.
        dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
        print(f'Concatenating all parts to: {dev_zipfile}')
        parts_pattern = os.path.join(self.base_path, "vox1_dev_wav_parta*")
        os.system(f'cat {parts_pattern} > {dev_zipfile}')

        # Extract the joined dev archive.
        decompress(dev_zipfile, self.base_path)

    # Meta files.
    if not os.path.isdir(self.meta_path):
        print("prepare the meta data")
        download_and_decompress(
            self.archieves_meta, self.meta_path, decompress=False)

    # The csv files are generated only when the csv directory is missing.
    if not os.path.isdir(self.csv_path):
        os.makedirs(self.csv_path)
        self.prepare_data()

    subset_csv = os.path.join(self.csv_path, f'{self.subset}.csv')
    print(f"read the {self.subset} from {subset_csv}")
    data = []
    with open(subset_csv, 'r') as rf:
        for row in rf.readlines()[1:]:  # skip the header row
            audio_id, duration, wav, start, stop, spk_id = row.strip(
            ).split(',')
            data.append(
                self.meta_info(audio_id,
                               float(duration), wav,
                               int(start), int(stop), spk_id))

    label_file = os.path.join(self.meta_path, 'spk_id2label.txt')
    with open(label_file, 'r') as f:
        for entry in f.readlines():
            spk_id, label = entry.strip().split(' ')
            self.spk_id2label[spk_id] = int(label)

    return data
def _convert_to_record(self, idx: int):
    """Load sample `idx` and return its record dict.

    The dict holds every META_INFO field plus 'feat', and additionally
    'label' for the train and dev subsets. With `random_chunk` enabled a
    window of `chunk_duration` seconds is cut at a random offset; otherwise
    the `start`/`stop` sample indices stored in the csv are used.
    """
    sample = self._data[idx]

    record = {}
    # To show all fields in a namedtuple: `type(sample)._fields`
    for field in type(sample)._fields:
        record[field] = getattr(sample, field)

    waveform, sr = paddlespeech.audio.load(record['wav'])

    # random select a chunk audio samples from the audio
    if self.random_chunk:
        num_wav_samples = waveform.shape[0]
        num_chunk_samples = int(self.chunk_duration * sr)
        # Clamp the upper bound: for audio that is not longer than one chunk
        # the unclamped bound is negative and randint() raises ValueError.
        # Such audio is now used from its first sample (the slice below then
        # simply yields the whole, possibly shorter, waveform).
        max_start = max(0, num_wav_samples - num_chunk_samples - 1)
        start = random.randint(0, max_start)
        stop = start + num_chunk_samples
    else:
        start = record['start']
        stop = record['stop']

    waveform = waveform[start:stop]

    assert self.feat_type in feat_funcs.keys(), \
        f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
    feat_func = feat_funcs[self.feat_type]
    # A falsy extractor (the 'raw' entry) means: return the waveform.
    feat = feat_func(
        waveform, sr=sr, **self.feat_config) if feat_func else waveform

    record.update({'feat': feat})
    if self.subset in ['train',
                       'dev']:  # Labels are available in train and dev.
        record.update({'label': self.spk_id2label[record['spk_id']]})

    return record
@staticmethod
def _get_chunks(seg_dur, audio_id, audio_duration):
num_chunks = int(audio_duration / seg_dur) # all in milliseconds
chunk_lst = [
audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
for i in range(num_chunks)
]
return chunk_lst
def _get_audio_info(self, wav_file: str,
                    split_chunks: bool) -> List[List[str]]:
    """Build the csv rows for one audio file.

    Each row is `[id, duration, wav, start, stop, spk_id]`, with start/stop
    given as sample indices. The speaker, session and utterance ids are the
    last three '/'-separated components of `wav_file`.
    """
    waveform, sr = paddlespeech.audio.load(wav_file)
    spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
    audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
    num_samples = waveform.shape[0]
    audio_duration = num_samples / sr

    if not split_chunks:
        # Keep whole audio.
        return [[audio_id, audio_duration, wav_file, 0, num_samples, spk_id]]

    # Split into pieces of self.chunk_duration seconds.
    ret = []
    for chunk in self._get_chunks(self.chunk_duration, audio_id,
                                  audio_duration):
        # The chunk id ends with its start and end timestamps.
        begin, end = chunk.split("_")[-2:]
        ret.append([
            chunk, audio_duration, wav_file, int(float(begin) * sr),
            int(float(end) * sr), spk_id
        ])
    return ret
def generate_csv(self,
                 wav_files: List[str],
                 output_file: str,
                 split_chunks: bool=True):
    """Write one csv describing `wav_files`.

    Header is `id,duration,wav,start,stop,spk_id`; rows come from
    `_get_audio_info`, which is run over the files in a process pool with
    one worker per CPU.
    """
    print(f'Generating csv: {output_file}')
    header = ["id", "duration", "wav", "start", "stop", "spk_id"]
    # Note: this may raise a C++ exception, but the program still runs fine,
    # so the exception can be ignored.
    with Pool(cpu_count()) as p:
        infos = list(
            tqdm(
                p.imap(lambda x: self._get_audio_info(x, split_chunks),
                       wav_files),
                total=len(wav_files)))

    csv_lines = []
    for info in infos:
        csv_lines.extend(info)

    # newline='' is what the csv module requires for files it writes;
    # without it, platforms that translate '\n' emit '\r\r\n' row endings.
    with open(output_file, mode="w", newline='') as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        for line in csv_lines:
            csv_writer.writerow(line)
def prepare_data(self):
    """Generate the speaker label file and the train/dev/enroll/test csv files.

    Speakers that appear in ``self.veri_test_file`` are excluded from the
    train and dev sets. The remaining audio files are shuffled and split
    according to ``self.split_ratio``.
    """
    # Audio of speakers in veri_test_file should not be included in training set.
    print("start to prepare the data csv file")

    # Gather the enroll and test audio paths listed in the trial file.
    enroll_set, test_set = set(), set()
    with open(self.veri_test_file, 'r') as trial_f:
        for trial in trial_f:
            _, enrol_rel, test_rel = trial.strip().split(' ')
            enroll_set.add(os.path.join(self.wav_path, enrol_rel))
            test_set.add(os.path.join(self.wav_path, test_rel))
    enroll_files = sorted(enroll_set)
    test_files = sorted(test_set)

    # Speakers used for enrollment/test: the directory right below "/wav/".
    test_spks = {
        trial_wav.split('/wav/')[1].split('/')[0]
        for trial_wav in enroll_files + test_files
    }

    # Collect every train/dev audio path together with its speaker.
    audio_files = []
    speakers = set()
    print("Getting file list...")
    for path in (self.wav_path, self.vox2_base_path):
        # An unset or missing directory (typically the optional vox2 one)
        # is reported and skipped.
        if not (path and os.path.exists(path)):
            print(f"{path} is an invalid path, please check again, "
                  "and we will ignore the vox2 base path")
            continue
        pattern = os.path.join(path, "**", "*.wav")
        for wav in glob.glob(pattern, recursive=True):
            spk = wav.split('/wav/')[1].split('/')[0]
            if spk not in test_spks:
                speakers.add(spk)
                audio_files.append(wav)

    print(
        f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
    )
    # Encode the train and dev speakers as integer labels in spk_id2label.txt
    # (1211 vox1, 5994 vox2, 7205 vox1+2).
    label_path = os.path.join(self.meta_path, 'spk_id2label.txt')
    with open(label_path, 'w') as f:
        f.writelines(f'{spk_id} {label}\n'
                     for label, spk_id in enumerate(sorted(speakers)))

    # Sort first so the shuffle only depends on the random state.
    audio_files.sort()
    random.shuffle(audio_files)
    # The first `split_ratio` share goes to train, the rest to dev.
    split_idx = int(self.split_ratio * len(audio_files))
    train_files = audio_files[:split_idx]
    dev_files = audio_files[split_idx:]

    csv_jobs = (
        (train_files, 'train.csv', True),
        (dev_files, 'dev.csv', True),
        (enroll_files, 'enroll.csv', False),
        (test_files, 'test.csv', False),
    )
    for files, csv_name, chunked in csv_jobs:
        self.generate_csv(
            files,
            os.path.join(self.csv_path, csv_name),
            split_chunks=chunked)
def __getitem__(self, idx):
    """Return the record at ``idx`` as built by ``_convert_to_record``."""
    record = self._convert_to_record(idx)
    return record
def __len__(self):
return len(self._data)

@ -1,17 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .layers import LogMelSpectrogram
from .layers import MelSpectrogram
from .layers import MFCC
from .layers import Spectrogram

@ -1,328 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Optional
from typing import Union
import paddle
import paddle.nn as nn
from paddle import Tensor
from ..functional import compute_fbank_matrix
from ..functional import create_dct
from ..functional import power_to_db
from ..functional.window import get_window
# Feature-extraction layers exported by this module.
__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
'MFCC',
]
class Spectrogram(nn.Layer):
    """Layer that converts waveforms into spectrograms.

    The output is the magnitude of the short-time Fourier transform raised
    to ``power``.

    Args:
        n_fft (int, optional): FFT size handed to the STFT. Defaults to 512.
        hop_length (Optional[int], optional): Step between successive
            frames. ``None`` is forwarded unchanged to ``paddle.signal.stft``,
            which then applies its own default. Defaults to None.
        win_length (Optional[int], optional): Window size; ``None`` means
            ``n_fft``. Defaults to None.
        window (str, optional): Name of the window function, resolved by
            ``get_window``. Defaults to 'hann'.
        power (float, optional): Exponent applied to the STFT magnitude;
            must be positive. Defaults to 2.0.
        center (bool, optional): Forwarded to the STFT (frame centering via
            padding). Defaults to True.
        pad_mode (str, optional): Padding mode used when ``center`` is True.
            Defaults to 'reflect'.
        dtype (str, optional): Data type of the window. Defaults to 'float32'.
    """

    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str='float32') -> None:
        super().__init__()

        assert power > 0, 'Power of spectrogram must be > 0.'
        self.power = power

        # Fall back to a window as long as the FFT.
        window_size = n_fft if win_length is None else win_length
        self.fft_window = get_window(
            window, window_size, fftbins=True, dtype=dtype)

        # Freeze every STFT option so forward() only supplies the signal.
        stft_options = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=window_size,
            window=self.fft_window,
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_options)
        self.register_buffer('fft_window', self.fft_window)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the spectrogram of ``x``.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
        """
        magnitude = paddle.abs(self._stft(x))
        return paddle.pow(magnitude, self.power)
class MelSpectrogram(nn.Layer):
    """Layer that converts waveforms into mel spectrograms.

    A ``Spectrogram`` is computed first and then multiplied by a mel filter
    bank matrix built with ``compute_fbank_matrix``.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): FFT size. Defaults to 512.
        hop_length (Optional[int], optional): Step between frames, forwarded
            to ``Spectrogram``. Defaults to None.
        win_length (Optional[int], optional): Window size, forwarded to
            ``Spectrogram``. Defaults to None.
        window (str, optional): Window function name. Defaults to 'hann'.
        power (float, optional): Exponent of the magnitude spectrogram.
            Defaults to 2.0.
        center (bool, optional): Forwarded to ``Spectrogram``.
            Defaults to True.
        pad_mode (str, optional): Padding mode used when ``center`` is True.
            Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Lowest filter frequency in Hz.
            Defaults to 50.0.
        f_max (Optional[float], optional): Highest filter frequency in Hz;
            ``None`` selects ``sr // 2``. Defaults to None.
        htk (bool, optional): Use the HTK mel formula. Defaults to False.
        norm (Union[str, float], optional): Filter bank normalization,
            'slaney' or a p-norm order. Defaults to 'slaney'.
        dtype (str, optional): Data type of the window and the filter bank.
            Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str='float32') -> None:
        super().__init__()

        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)

        # Keep the configuration as given by the caller (f_max may be None).
        self.n_mels, self.f_min, self.f_max = n_mels, f_min, f_max
        self.htk, self.norm = htk, norm

        # Without an explicit upper bound, use half the sample rate.
        upper_hz = sr // 2 if f_max is None else f_max
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=upper_hz,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the mel spectrogram of ``x``.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        power_spec = self._spectrogram(x)
        return paddle.matmul(self.fbank_matrix, power_spec)
class LogMelSpectrogram(nn.Layer):
    """Layer that converts waveforms into log-mel spectrograms.

    A ``MelSpectrogram`` is computed first and then converted to decibels
    with ``power_to_db``.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): FFT size. Defaults to 512.
        hop_length (Optional[int], optional): Step between frames, forwarded
            to ``MelSpectrogram``. Defaults to None.
        win_length (Optional[int], optional): Window size, forwarded to
            ``MelSpectrogram``. Defaults to None.
        window (str, optional): Window function name. Defaults to 'hann'.
        power (float, optional): Exponent of the magnitude spectrogram.
            Defaults to 2.0.
        center (bool, optional): Forwarded to ``MelSpectrogram``.
            Defaults to True.
        pad_mode (str, optional): Padding mode used when ``center`` is True.
            Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Lowest filter frequency in Hz.
            Defaults to 50.0.
        f_max (Optional[float], optional): Highest filter frequency in Hz.
            Defaults to None.
        htk (bool, optional): Use the HTK mel formula. Defaults to False.
        norm (Union[str, float], optional): Filter bank normalization,
            'slaney' or a p-norm order. Defaults to 'slaney'.
        ref_value (float, optional): Reference value of the dB conversion.
            Defaults to 1.0.
        amin (float, optional): Lower clamp applied to the input of the
            logarithm. Defaults to 1e-10.
        top_db (Optional[float], optional): If set, the output is clamped to
            at most ``top_db`` below its maximum. Defaults to None.
        dtype (str, optional): Data type of the window and the filter bank.
            Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        super().__init__()

        # Settings of the decibel conversion applied in forward().
        self.ref_value, self.amin, self.top_db = ref_value, amin, top_db

        self._melspectrogram = MelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the log-mel spectrogram of ``x``.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        return power_to_db(
            self._melspectrogram(x),
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
class MFCC(nn.Layer):
    """Compute mel frequency cepstral coefficients (MFCCs) of given waveforms.

    A ``LogMelSpectrogram`` is computed first and then projected onto a DCT
    basis built with ``create_dct``.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_mfcc (int, optional): Number of cepstral coefficients to return;
            must not exceed ``n_mels``. Defaults to 40.
        n_fft (int, optional): FFT size. Defaults to 512.
        hop_length (Optional[int], optional): Step between frames, forwarded
            to ``LogMelSpectrogram``. Defaults to None.
        win_length (Optional[int], optional): Window size, forwarded to
            ``LogMelSpectrogram``. Defaults to None.
        window (str, optional): Window function name. Defaults to 'hann'.
        power (float, optional): Exponent of the magnitude spectrogram.
            Defaults to 2.0.
        center (bool, optional): Forwarded to ``LogMelSpectrogram``.
            Defaults to True.
        pad_mode (str, optional): Padding mode used when ``center`` is True.
            Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Lowest filter frequency in Hz.
            Defaults to 50.0.
        f_max (Optional[float], optional): Highest filter frequency in Hz.
            Defaults to None.
        htk (bool, optional): Use the HTK mel formula. Defaults to False.
        norm (Union[str, float], optional): Filter bank normalization,
            'slaney' or a p-norm order. Defaults to 'slaney'.
        ref_value (float, optional): Reference value of the dB conversion.
            Defaults to 1.0.
        amin (float, optional): Lower clamp applied to the input of the
            logarithm. Defaults to 1e-10.
        top_db (Optional[float], optional): If set, the log-mel values are
            clamped to at most ``top_db`` below their maximum.
            Defaults to None.
        dtype (str, optional): Data type of the window, the filter bank and
            the DCT matrix. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        super(MFCC, self).__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)
        self._log_melspectrogram = LogMelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            ref_value=ref_value,
            amin=amin,
            top_db=top_db,
            dtype=dtype)
        # DCT basis with shape (n_mels, n_mfcc), see create_dct.
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the MFCCs of ``x``.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
        """
        log_mel_feature = self._log_melspectrogram(x)
        # Move the mel axis last so it can be contracted with the DCT basis,
        # then restore the (N, n_mfcc, num_frames) layout.
        mfcc = paddle.matmul(
            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
                (0, 2, 1))
        return mfcc

@ -1,20 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .functional import compute_fbank_matrix
from .functional import create_dct
from .functional import fft_frequencies
from .functional import hz_to_mel
from .functional import mel_frequencies
from .functional import mel_to_hz
from .functional import power_to_db

@ -1,266 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import math
from typing import Optional
from typing import Union
import paddle
from paddle import Tensor
# Mel-scale, filter bank, decibel and DCT helpers exported by this module.
__all__ = [
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'fft_frequencies',
'compute_fbank_matrix',
'power_to_db',
'create_dct',
]
def hz_to_mel(freq: Union[Tensor, float],
              htk: bool=False) -> Union[Tensor, float]:
    """Map frequencies in Hz onto the mel scale.

    With ``htk=True`` the HTK formula ``2595 * log10(1 + f / 700)`` is used.
    Otherwise the scale is linear up to 1000 Hz and logarithmic above.

    Args:
        freq (Union[Tensor, float]): Frequency or tensor of frequencies
            (arbitrary shape) in Hz.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[Tensor, float]: Frequency in mels, same kind as the input.
    """
    is_tensor = isinstance(freq, Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below the log region.
    linear = (freq - f_min) / f_sp

    if is_tensor:
        # 1e-10 keeps the logarithm finite for zero frequencies.
        log_part = min_log_mel + paddle.log(
            freq / min_log_hz + 1e-10) / logstep
        # Blend both parts with a 0/1 mask (stand-in for a masked fill).
        in_log_region = (freq > min_log_hz).astype(freq.dtype)
        return log_part * in_log_region + linear * (1 - in_log_region)

    if freq >= min_log_hz:
        return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
    return linear
def mel_to_hz(mel: Union[float, Tensor],
              htk: bool=False) -> Union[float, Tensor]:
    """Map mel values back to frequencies in Hz.

    This is the inverse of the scale used by ``hz_to_mel``: the HTK formula
    when ``htk=True``, otherwise linear below the log region and exponential
    above it.

    Args:
        mel (Union[float, Tensor]): Mel value or tensor of mel values with
            arbitrary shape.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[float, Tensor]: Frequencies in Hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below the log region.
    linear = f_min + f_sp * mel

    if isinstance(mel, Tensor):
        log_part = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
        # Blend both parts with a 0/1 mask (stand-in for a masked fill).
        in_log_region = (mel > min_log_mel).astype(mel.dtype)
        return log_part * in_log_region + linear * (1 - in_log_region)

    if mel >= min_log_mel:
        return min_log_hz * math.exp(logstep * (mel - min_log_mel))
    return linear
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str='float32') -> Tensor:
    """Return ``n_mels`` frequencies evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.
        dtype (str, optional): The data type of the return frequencies.
            Defaults to 'float32'.

    Returns:
        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Space the points uniformly in mels, then convert them back to Hz.
    lowest_mel = hz_to_mel(f_min, htk=htk)
    highest_mel = hz_to_mel(f_max, htk=htk)
    mel_points = paddle.linspace(lowest_mel, highest_mel, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
    """Return the center frequencies of the non-negative FFT bins.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        dtype (str, optional): The data type of the return frequencies.
            Defaults to 'float32'.

    Returns:
        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
    """
    nyquist = float(sr) / 2
    num_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, nyquist, num_bins, dtype=dtype)
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str='float32') -> Tensor:
    """Build the triangular mel filter bank matrix.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (Optional[float], optional): Maximum frequency in Hz; ``None``
            selects ``sr / 2``. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (Union[str, float], optional): 'slaney' for area normalization,
            or a number used as p-norm order per filter. Any other value
            leaves the filters unnormalized. Defaults to 'slaney'.
        dtype (str, optional): The data type of the return matrix.
            Defaults to 'float32'.

    Returns:
        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
    """
    if f_max is None:
        f_max = float(sr) / 2

    # Center frequency of every FFT bin.
    bin_hz = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
    # Band edges: n_mels filters need n_mels + 2 points on the mel scale.
    edges_hz = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    # Distance between consecutive edges, and from every edge to every bin.
    edge_gaps = edges_hz[1:] - edges_hz[:-1]
    ramps = edges_hz.unsqueeze(1) - bin_hz.unsqueeze(0)

    weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
    for band in range(n_mels):
        # Rising and falling slopes of the triangle for all bins ...
        rising = -ramps[band] / edge_gaps[band]
        falling = ramps[band + 2] / edge_gaps[band + 1]
        # ... intersected with each other and clipped at zero.
        triangle = paddle.minimum(rising, falling)
        weights[band] = paddle.maximum(paddle.zeros_like(rising), triangle)

    if norm == 'slaney':
        # Slaney-style mel is scaled to be approx constant energy per channel.
        enorm = 2.0 / (edges_hz[2:n_mels + 2] - edges_hz[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
def power_to_db(spect: Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling `10 * log10(x / ref)` in a numerically
    stable way: the input is clamped from below by ``amin`` before taking the
    logarithm.

    Args:
        spect (Tensor): STFT power spectrogram.
        ref_value (float, optional): The reference value. If smaller than 1.0,
            the db level of the signal will be pulled up accordingly.
            Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db`
            below the peak. Defaults to None.

    Returns:
        Tensor: Power spectrogram in db scale.

    Raises:
        ValueError: If ``amin`` or ``ref_value`` is not strictly positive, or
            if ``top_db`` is negative.
    """
    # Validate every argument up front, before any tensor work is done.
    # ValueError is a subclass of Exception, so existing handlers still match.
    if amin <= 0:
        raise ValueError("amin must be strictly positive")
    if ref_value <= 0:
        raise ValueError("ref_value must be strictly positive")
    if top_db is not None and top_db < 0:
        raise ValueError("top_db must be non-negative")

    ones = paddle.ones_like(spect)
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is not None:
        # Clamp everything that lies more than top_db below the peak.
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
    return log_spec
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: str='float32') -> Tensor:
    """Build the matrix of a discrete cosine transform (DCT).

    Args:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (Optional[str], optional): Normalization type, either 'ortho'
            or None. Defaults to 'ortho'.
        dtype (str, optional): The data type of the return matrix.
            Defaults to 'float32'.

    Returns:
        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
    """
    mel_index = paddle.arange(n_mels, dtype=dtype)
    coeff_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    angular_step = math.pi / float(n_mels)
    # Cosine basis with size (n_mfcc, n_mels).
    basis = paddle.cos(angular_step * (mel_index + 0.5) * coeff_index)

    if norm is None:
        basis *= 2.0
        return basis.T

    assert norm == "ortho"
    # Orthonormal scaling: the first row gets an extra 1/sqrt(2).
    basis[0] *= 1.0 / math.sqrt(2.0)
    basis *= math.sqrt(2.0 / float(n_mels))
    return basis.T

@ -1,337 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List
from typing import Tuple
from typing import Union
import paddle
from paddle import Tensor
# Only the window factory is public; the per-window helpers are private.
__all__ = [
'get_window',
]
def _cat(x: List[Tensor], data_type: str) -> Tensor:
    """Convert every item of ``x`` to a tensor of ``data_type`` and concatenate them."""
    pieces = [paddle.to_tensor(item, data_type) for item in x]
    return paddle.concat(pieces)
def _acosh(x: Union[Tensor, float]) -> Tensor:
    """Inverse hyperbolic cosine, ``log(x + sqrt(x**2 - 1))``.

    Python floats are handled with ``math`` (a float is returned); every
    other input goes through the paddle operators.
    """
    if not isinstance(x, float):
        return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
    return math.log(x + math.sqrt(x**2 - 1))
def _extend(M: int, sym: bool) -> bool:
"""Extend window by 1 sample if needed for DFT-even symmetry. """
if not sym:
return M + 1, True
else:
return M, False
def _len_guards(M: int) -> bool:
"""Handle small or incorrect window lengths. """
if int(M) != M or M < 0:
raise ValueError('Window length M must be a non-negative integer')
return M <= 1
def _truncate(w: Tensor, needed: bool) -> Tensor:
    """Drop the last sample of ``w`` when ``needed`` is set, else return ``w`` as is."""
    return w[:-1] if needed else w
def _general_gaussian(M: int, p, sig, sym: bool=True,
                      dtype: str='float64') -> Tensor:
    """Compute a window with a generalized Gaussian shape.

    The window is ``exp(-0.5 * |n / sig| ** (2 * p))`` with ``n`` centered on
    the middle of the window. This function is consistent with
    scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    half_span = (M - 1.0) / 2.0
    centered = paddle.arange(0, M, dtype=dtype) - half_span
    window = paddle.exp(-0.5 * paddle.abs(centered / sig)**(2 * p))
    return _truncate(window, needs_trunc)
def _general_cosine(M: int, a: List[float], sym: bool=True,
                    dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.

    This function is consistent with scipy.signal.windows.general_cosine().

    Args:
        M (int): Number of samples of the window.
        a (List[float]): Weighting coefficients; ``a[k]`` multiplies
            ``cos(k * x)`` with ``x`` running from -pi to pi.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window with ``M`` samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    w = paddle.zeros((M, ), dtype=dtype)
    for k, coeff in enumerate(a):
        w += coeff * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
def _general_hamming(M: int, alpha: float, sym: bool=True,
                     dtype: str='float64') -> Tensor:
    """Compute a generalized Hamming window.

    It is the two-term cosine window with coefficients ``alpha`` and
    ``1 - alpha``. This function is consistent with
    scipy.signal.windows.general_hamming()
    """
    coefficients = [alpha, 1. - alpha]
    return _general_cosine(M, coefficients, sym, dtype=dtype)
def _taylor(M: int,
            nbar=4,
            sll=30,
            norm=True,
            sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Taylor window.

    The Taylor window taper function approximates the Dolph-Chebyshev
    window's constant sidelobe level for a parameterized number of near-in
    sidelobes.

    Args:
        M (int): Number of samples of the window.
        nbar (int, optional): Number of nearly constant-level sidelobes.
            Defaults to 4.
        sll (float, optional): Sidelobe level, given as a positive number.
            Defaults to 30.
        norm (bool, optional): If True, scale the window by its value at the
            center position ``(M - 1) / 2``. Defaults to True.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    # The original text uses a negative sidelobe level and negates it when
    # computing B; here the level is assumed to be positive, like elsewhere.
    B = 10**(sll / 20)
    A = _acosh(B) / math.pi
    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
    orders = paddle.arange(1, nbar, dtype=dtype)

    coeffs = paddle.empty((nbar - 1, ), dtype=dtype)
    # Alternating +1 / -1 factors.
    signs = paddle.empty_like(orders)
    signs[::2] = 1
    signs[1::2] = -1
    orders_sq = orders * orders

    num_orders = len(orders)
    for idx in range(num_orders):
        numer = signs[idx] * paddle.prod(
            1 - orders_sq[idx] / s2 / (A**2 + (orders - 0.5)**2))
        # The product runs over all other orders; the first and last order
        # only have neighbours on one side.
        if idx == 0:
            denom = 2 * paddle.prod(1 - orders_sq[idx] / orders_sq[idx + 1:])
        elif idx == num_orders - 1:
            denom = 2 * paddle.prod(1 - orders_sq[idx] / orders_sq[:idx])
        else:
            denom = 2 * paddle.prod(
                1 - orders_sq[idx] / orders_sq[:idx]) * paddle.prod(
                    1 - orders_sq[idx] / orders_sq[idx + 1:])
        coeffs[idx] = numer / denom

    def _window_at(position):
        # Cosine series evaluated at one position or at a tensor of positions.
        phase = 2 * math.pi * orders.unsqueeze(1) * (position - M / 2. + 0.5
                                                     ) / M
        return 1 + 2 * paddle.matmul(coeffs.unsqueeze(0), paddle.cos(phase))

    window = _window_at(paddle.arange(0, M, dtype=dtype))

    # normalize (Note that this is not described in the original text [1])
    if norm:
        window *= 1.0 / _window_at((M - 1) / 2)
    window = window.squeeze()

    return _truncate(window, needs_trunc)
def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hamming window.

    This is the generalized Hamming window with ``alpha = 0.54``: a raised
    cosine with non-zero endpoints, optimized to minimize the nearest side
    lobe.
    """
    return _general_hamming(M, alpha=0.54, sym=sym, dtype=dtype)
def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hann window.

    This is the generalized Hamming window with ``alpha = 0.5``: a raised
    cosine (sine-squared) taper whose ends touch zero.
    """
    return _general_hamming(M, alpha=0.5, sym=sym, dtype=dtype)
def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Tukey window.

    The Tukey window is also known as a tapered cosine window.

    Args:
        M (int): Number of samples of the window.
        alpha (float, optional): Fraction of the window inside the cosine
            tapered region. A value ``<= 0`` yields a rectangular window, a
            value ``>= 1`` yields a Hann window. Defaults to 0.5.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window with ``M`` samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    if alpha <= 0:
        return paddle.ones((M, ), dtype=dtype)
    elif alpha >= 1.0:
        # Fully tapered window: delegate to the module's Hann helper and
        # keep the requested dtype.
        return _hann(M, sym=sym, dtype=dtype)

    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype)
    width = int(alpha * (M - 1) / 2.0)
    # Rising taper, flat top, falling taper.
    n1 = n[0:width + 1]
    n2 = n[width + 1:M - width - 1]
    n3 = n[M - width - 1:]

    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
                                          (M - 1))))
    w = paddle.concat([w1, w2, w3])

    return _truncate(w, needs_trunc)
def _kaiser(M: int, beta: float, sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Kaiser window.

    The Kaiser window is a taper formed by using a Bessel function.

    Raises:
        NotImplementedError: Always; this window is not implemented here.
    """
    raise NotImplementedError("The 'kaiser' window is not implemented.")
def _gaussian(M: int, std: float, sym: bool=True,
              dtype: str='float64') -> Tensor:
    """Compute a Gaussian window.

    The window is ``exp(-n**2 / (2 * std**2))`` with ``n`` centered on the
    middle of the window, i.e. a Gaussian shape defined by the standard
    deviation ``std``.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    half_span = (M - 1.0) / 2.0
    centered = paddle.arange(0, M, dtype=dtype) - half_span
    two_variance = 2 * std * std
    window = paddle.exp(-centered**2 / two_variance)
    return _truncate(window, needs_trunc)
def _exponential(M: int,
                 center=None,
                 tau=1.,
                 sym: bool=True,
                 dtype: str='float64') -> Tensor:
    """Compute an exponential (or Poisson) window.

    The window is ``exp(-|n - center| / tau)``. When ``center`` is None it is
    placed in the middle of the window.

    Raises:
        ValueError: If ``center`` is given together with ``sym=True``.
    """
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    peak = (M - 1) / 2 if center is None else center
    positions = paddle.arange(0, M, dtype=dtype)
    window = paddle.exp(-paddle.abs(positions - peak) / tau)
    return _truncate(window, needs_trunc)
def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a triangular window.

    The rising half is computed and then mirrored; for odd lengths the peak
    sample is not repeated.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    ramp = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
    if M % 2:
        rising = 2 * ramp / (M + 1.0)
        # Mirror without duplicating the central peak.
        window = paddle.concat([rising, rising[-2::-1]])
    else:
        rising = (2 * ramp - 1.0) / M
        window = paddle.concat([rising, rising[::-1]])
    return _truncate(window, needs_trunc)
def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Bohman window.

    The Bohman window is the autocorrelation of a cosine window. Interior
    samples are ``(1 - |x|) * cos(pi * |x|) + sin(pi * |x|) / pi`` for ``x``
    on an evenly spaced grid over ``[-1, 1]``; the two end points are set to
    zero explicitly.

    Args:
        M (int): Number of points in the window.
        sym (bool, optional): Forwarded to ``_extend``. Defaults to True.
        dtype (str, optional): Data type of the returned tensor.

    Returns:
        Tensor: The window. When ``_len_guards(M)`` is true a tensor of ones
        with shape ``(M,)`` is returned instead.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    length, drop_last = _extend(M, sym)

    # Interior grid points only; the end points are appended as zeros below.
    grid = paddle.linspace(-1, 1, length, dtype=dtype)
    dist = paddle.abs(grid[1:-1])

    taper = (1 - dist) * paddle.cos(math.pi * dist)
    correction = 1.0 / math.pi * paddle.sin(math.pi * dist)
    window = _cat([0, taper + correction, 0], dtype)
    return _truncate(window, drop_last)
def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Blackman window.

    The Blackman window is a taper built from the first three terms of a
    summation of cosines. It was designed to have close to the minimal
    leakage possible and is only slightly worse than a Kaiser window.

    Args:
        M (int): Number of points in the window.
        sym (bool, optional): Forwarded to ``_general_cosine``.
        dtype (str, optional): Data type of the returned tensor.

    Returns:
        Tensor: Whatever ``_general_cosine`` returns for the Blackman
        coefficients.
    """
    # Cosine-sum coefficients that define the Blackman window.
    blackman_coeffs = [0.42, 0.50, 0.08]
    return _general_cosine(M, blackman_coeffs, sym, dtype=dtype)
def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a window with a simple cosine shape.

    Samples are ``sin(pi / M * (n + 0.5))`` for ``n = 0 .. M - 1``.

    Args:
        M (int): Number of points in the window.
        sym (bool, optional): Forwarded to ``_extend``. Defaults to True.
        dtype (str, optional): Data type of the returned tensor.

    Returns:
        Tensor: The window. When ``_len_guards(M)`` is true a tensor of ones
        with shape ``(M,)`` is returned instead.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    length, drop_last = _extend(M, sym)

    # Evaluate the half-sine at the centre of each sample bin.
    bin_centres = paddle.arange(0, length, dtype=dtype) + .5
    window = paddle.sin(math.pi / length * bin_centres)
    return _truncate(window, drop_last)
def get_window(window: Union[str, Tuple[str, float]],
               win_length: int,
               fftbins: bool=True,
               dtype: str='float64') -> Tensor:
    """Return a window of a given length and type.

    The window name is resolved to the module-level function called
    ``'_' + name`` (for example ``'hann'`` -> ``_hann``), which is then called
    as ``func(win_length, *extra_args, dtype=dtype, sym=not fftbins)``.

    Args:
        window (Union[str, Tuple[str, float]]): The window function applied to
            the signal before the Fourier transform. Either a window name, or
            a tuple ``(name, param, ...)`` for windows that take parameters.
            Supported window functions: 'hamming', 'hann', 'kaiser',
            'gaussian', 'exponential', 'triang', 'bohman', 'blackman',
            'cosine', 'tukey', 'taylor'. 'gaussian' and 'exponential' must be
            passed as a tuple.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window.
            Otherwise, create a "symmetric" window, for use in filter design.
            Defaults to True.
        dtype (str, optional): The data type of the return window.
            Defaults to 'float64'.

    Returns:
        Tensor: The window represented as a tensor.

    Raises:
        ValueError: If ``window`` is neither a str nor a tuple, if a window
            that needs parameters is given as a bare string, or if the name
            does not resolve to a window function in this module.
    """
    sym = not fftbins

    args = ()
    if isinstance(window, tuple):
        winstr = window[0]
        if len(window) > 1:
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
            raise ValueError("The '" + window + "' window needs one or "
                             "more parameters -- pass a tuple.")
        winstr = window
    else:
        raise ValueError("%s as window type is not supported." %
                         str(type(window)))

    # Resolve the name through the module namespace instead of eval(): an
    # unknown name under eval() raised NameError (which the previous
    # `except KeyError` never caught), and eval() would execute arbitrary
    # expressions embedded in a caller-supplied string.
    winfunc = None
    if isinstance(winstr, str):
        winfunc = globals().get('_' + winstr)
    if not callable(winfunc):
        raise ValueError("Unknown window type.")

    params = (win_length, ) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)

@ -1,13 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -1,15 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .kaldi import fbank
from .kaldi import pitch

@ -1,132 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddlespeech
from paddlespeech.audio._internal import module_utils
# Public API of this module: the two feature extractors defined below.
__all__ = [
    'fbank',
    'pitch',
]
@module_utils.requires_kaldi()
def fbank(
        wav,
        samp_freq: int=16000,
        frame_shift_ms: float=10.0,
        frame_length_ms: float=25.0,
        dither: float=0.0,
        preemph_coeff: float=0.97,
        remove_dc_offset: bool=True,
        window_type: str='povey',
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
        allow_downsample: bool=False,
        allow_upsample: bool=False,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
        high_freq: float=0,
        vtln_low: float=100,
        vtln_high: float=-500,
        debug_mel: bool=False,
        htk_mode: bool=False,
        use_energy: bool=False,  # fbank opts
        energy_floor: float=0.0,
        raw_energy: bool=True,
        htk_compat: bool=False,
        use_log_fbank: bool=True,
        use_power: bool=True):
    """Compute filter-bank features through the compiled ``_paddleaudio`` extension.

    Every keyword argument is copied onto the attribute of the same name on
    one of three option objects (``FrameExtractionOptions``,
    ``MelBanksOptions``, ``FbankOptions``), which are then passed together
    with ``wav`` to ``ComputeFbank``.

    Args:
        wav: Input waveform, handed to ``ComputeFbank`` unchanged.
        samp_freq .. max_feature_vectors: Frame extraction options.
        num_bins .. htk_mode: Mel filter-bank options.
        use_energy .. use_power: Fbank options.

    Returns:
        The value returned by ``ComputeFbank``.
    """
    ext = paddlespeech.audio._paddleaudio
    frame_opts = ext.FrameExtractionOptions()
    mel_opts = ext.MelBanksOptions()
    fbank_opts = ext.FbankOptions()

    # (option object, ((attribute name, value), ...)) in assignment order.
    option_table = (
        (frame_opts, (
            ('samp_freq', samp_freq),
            ('frame_shift_ms', frame_shift_ms),
            ('frame_length_ms', frame_length_ms),
            ('dither', dither),
            ('preemph_coeff', preemph_coeff),
            ('remove_dc_offset', remove_dc_offset),
            ('window_type', window_type),
            ('round_to_power_of_two', round_to_power_of_two),
            ('blackman_coeff', blackman_coeff),
            ('snip_edges', snip_edges),
            ('allow_downsample', allow_downsample),
            ('allow_upsample', allow_upsample),
            ('max_feature_vectors', max_feature_vectors),
        )),
        (mel_opts, (
            ('num_bins', num_bins),
            ('low_freq', low_freq),
            ('high_freq', high_freq),
            ('vtln_low', vtln_low),
            ('vtln_high', vtln_high),
            ('debug_mel', debug_mel),
            ('htk_mode', htk_mode),
        )),
        (fbank_opts, (
            ('use_energy', use_energy),
            ('energy_floor', energy_floor),
            ('raw_energy', raw_energy),
            ('htk_compat', htk_compat),
            ('use_log_fbank', use_log_fbank),
            ('use_power', use_power),
        )),
    )
    for opts, settings in option_table:
        for field, value in settings:
            setattr(opts, field, value)

    return ext.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
@module_utils.requires_kaldi()
def pitch(wav,
          samp_freq: int=16000,
          frame_shift_ms: float=10.0,
          frame_length_ms: float=25.0,
          preemph_coeff: float=0.0,
          min_f0: int=50,
          max_f0: int=400,
          soft_min_f0: float=10.0,
          penalty_factor: float=0.1,
          lowpass_cutoff: int=1000,
          resample_freq: int=4000,
          delta_pitch: float=0.005,
          nccf_ballast: int=7000,
          lowpass_filter_width: int=1,
          upsample_filter_width: int=5,
          max_frames_latency: int=0,
          frames_per_chunk: int=0,
          simulate_first_pass_online: bool=False,
          recompute_frame: int=500,
          nccf_ballast_online: bool=False,
          snip_edges: bool=True):
    """Compute pitch features through the compiled ``_paddleaudio`` extension.

    Every keyword argument is copied onto the attribute of the same name on a
    ``PitchExtractionOptions`` object, which is then passed together with
    ``wav`` to ``ComputeKaldiPitch``.

    Args:
        wav: Input waveform, handed to ``ComputeKaldiPitch`` unchanged.
        samp_freq .. snip_edges: Pitch extraction options; each is stored on
            the option object under its own name.

    Returns:
        The value returned by ``ComputeKaldiPitch``.
    """
    ext = paddlespeech.audio._paddleaudio
    pitch_opts = ext.PitchExtractionOptions()

    # (attribute name, value) pairs, applied in this order.
    settings = (
        ('samp_freq', samp_freq),
        ('frame_shift_ms', frame_shift_ms),
        ('frame_length_ms', frame_length_ms),
        ('preemph_coeff', preemph_coeff),
        ('min_f0', min_f0),
        ('max_f0', max_f0),
        ('soft_min_f0', soft_min_f0),
        ('penalty_factor', penalty_factor),
        ('lowpass_cutoff', lowpass_cutoff),
        ('resample_freq', resample_freq),
        ('delta_pitch', delta_pitch),
        ('nccf_ballast', nccf_ballast),
        ('lowpass_filter_width', lowpass_filter_width),
        ('upsample_filter_width', upsample_filter_width),
        ('max_frames_latency', max_frames_latency),
        ('frames_per_chunk', frames_per_chunk),
        ('simulate_first_pass_online', simulate_first_pass_online),
        ('recompute_frame', recompute_frame),
        ('nccf_ballast_online', nccf_ballast_online),
        ('snip_edges', snip_edges),
    )
    for field, value in settings:
        setattr(pitch_opts, field, value)

    features = ext.ComputeKaldiPitch(pitch_opts, wav)
    return features

@ -1,15 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .eer import compute_eer
from .eer import compute_minDCF

@ -1,100 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from sklearn.metrics import roc_curve
def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
    """Compute the equal error rate (EER) and the score threshold at that point.

    The ROC curve is obtained from ``roc_curve``; the operating point is the
    one where the false negative rate (``1 - tpr``) and the false positive
    rate are closest to each other.

    Args:
        labels (np.ndarray): Trial labels, one-dimensional with shape [N],
            where N is the number of samples.
        scores (np.ndarray): Trial scores, one-dimensional with shape [N].

    Returns:
        List[float]: ``(eer, threshold)`` -- the false positive rate at the
        selected operating point and the matching score threshold.
    """
    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
    fnr = 1 - tpr

    # Index where the miss rate and the false-alarm rate are closest.
    gap = np.absolute((fnr - fpr))
    eer_index = np.nanargmin(gap)

    eer_threshold = threshold[eer_index]
    eer = fpr[eer_index]
    return eer, eer_threshold
def compute_minDCF(positive_scores,
                   negative_scores,
                   c_miss=1.0,
                   c_fa=1.0,
                   p_target=0.01):
    """Compute the minimum detection cost (minDCF) and its threshold.

    This is modified from SpeechBrain
    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509

    minDCF is normally used to evaluate speaker verification systems. It is
    the minimum over the candidate thresholds of

        C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)

    where p_miss is the miss probability and p_fa is the false alarm
    probability. Candidate thresholds are the unique input scores plus the
    midpoints between neighbouring unique scores.

    Args:
        positive_scores (Paddle.Tensor): The scores from entries of the same class.
        negative_scores (Paddle.Tensor): The scores from entries of different classes.
        c_miss (float, optional): Cost assigned to a missing error (default 1.0).
        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
        p_target (float, optional): Prior probability of having a target (default 0.01).

    Returns:
        List[float]: ``(min_dcf, threshold)`` as Python floats.
    """
    # Inputs with more than one dimension are squeezed before use.
    if len(positive_scores.shape) > 1:
        positive_scores = positive_scores.squeeze()
    if len(negative_scores.shape) > 1:
        negative_scores = negative_scores.squeeze()

    # Candidate thresholds: every unique score ...
    all_scores = paddle.concat([positive_scores, negative_scores])
    unique_scores = paddle.unique(paddle.sort(all_scores))
    # ... plus the midpoint between each pair of neighbours.
    midpoints = (unique_scores[0:-1] + unique_scores[1:]) / 2
    thresholds = paddle.sort(paddle.concat([unique_scores, midpoints]))
    num_thresholds = len(thresholds)

    # False rejection rate (miss detection): one row of scores per threshold,
    # transposed so the comparison broadcasts against the thresholds.
    pos_tiled = paddle.concat(num_thresholds * [positive_scores.unsqueeze(0)])
    pos_rejected = pos_tiled.transpose(perm=[1, 0]) <= thresholds
    p_miss = (pos_rejected.sum(0)).astype("float32") / pos_tiled.shape[1]
    # Drop the large intermediates before building the next pair.
    del pos_tiled
    del pos_rejected

    # False acceptance rate (false alarm), built the same way.
    neg_tiled = paddle.concat(num_thresholds * [negative_scores.unsqueeze(0)])
    neg_accepted = neg_tiled.transpose(perm=[1, 0]) > thresholds
    p_fa = (neg_accepted.sum(0)).astype("float32") / neg_tiled.shape[1]
    del neg_tiled
    del neg_accepted

    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
    c_min = paddle.min(c_det, axis=0)
    min_index = paddle.argmin(c_det, axis=0)
    return float(c_min), float(thresholds[min_index])

@ -1,25 +0,0 @@
from paddlespeech.audio._internal import module_utils as _mod_utils
from .sox_effects import (
apply_effects_file,
apply_effects_tensor,
effect_names,
init_sox_effects,
shutdown_sox_effects,
)
# Import-time side effect: when the sox backend is reported as available,
# initialise the sox effects once and register the matching shutdown to run
# at interpreter exit. The order matters -- initialise first, then register.
if _mod_utils.is_sox_available():
    import atexit

    init_sox_effects()
    atexit.register(shutdown_sox_effects)

# Names exported by `from ... import *`.
__all__ = [
    "init_sox_effects",
    "shutdown_sox_effects",
    "effect_names",
    "apply_effects_tensor",
    "apply_effects_file",
]

@ -1,238 +0,0 @@
import os
from typing import List, Optional, Tuple
import paddle
import numpy
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio.utils.sox_utils import list_effects
from paddlespeech.audio import _paddleaudio as paddleaudio
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
@_mod_utils.requires_sox()
def init_sox_effects():
    """Initialize resources required to use sox effects.

    Thin wrapper around ``paddleaudio.sox_effects_initialize_sox_effects()``.

    Note:
        You do not need to call this function manually. It is called automatically.

        Once initialized, you do not need to call this function again across
        multiple uses of sox effects, though it is safe to do so as long as
        :func:`shutdown_sox_effects` has not been called yet.
        Once :func:`shutdown_sox_effects` is called, you can no longer use SoX
        effects and initializing again will result in an error.
    """
    paddleaudio.sox_effects_initialize_sox_effects()
@_mod_utils.requires_sox()
def shutdown_sox_effects():
    """Clean up resources required to use sox effects.

    Thin wrapper around ``paddleaudio.sox_effects_shutdown_sox_effects()``.

    Note:
        You do not need to call this function manually. It is called automatically.

        It is safe to call this function multiple times.
        Once :py:func:`shutdown_sox_effects` is called, you can no longer use
        SoX effects and initializing again will result in an error.
    """
    paddleaudio.sox_effects_shutdown_sox_effects()
@_mod_utils.requires_sox()
def effect_names() -> List[str]:
    """Get the list of valid sox effect names.

    The names are the keys of the mapping returned by ``list_effects()``.

    Returns:
        List[str]: list of available effect names.

    Example
        >>> paddleaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    return list(list_effects().keys())
@_mod_utils.requires_sox()
def apply_effects_tensor(
        tensor: paddle.Tensor,
        sample_rate: int,
        effects: List[List[str]],
        channels_first: bool = True,
) -> Tuple[paddle.Tensor, int]:
    """Apply sox effects to the given Tensor.

    The Tensor is converted with ``tensor.numpy()``, handed to the compiled
    ``sox_effects_apply_effects_tensor`` binding, and the resulting array is
    wrapped back into a Tensor.

    Note:
        This function only works on CPU Tensors.

        It behaves much like the ``sox`` command, with slight differences.
        For example, the ``sox`` command adds certain effects automatically
        (such as a ``rate`` effect after ``speed``, ``pitch`` and other
        effects), whereas this function applies only the given effects.
        Therefore, to actually apply a ``speed`` effect you also need to give
        a ``rate`` effect with the desired sampling rate.

    Args:
        tensor (paddle.Tensor): Input 2D CPU Tensor.
        sample_rate (int): Sample rate.
        effects (List[List[str]]): List of effects.
        channels_first (bool, optional): Indicates if the input Tensor's
            dimension is `[channels, time]` or `[time, channels]`.

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        The resulting Tensor has the same ``dtype`` as the input Tensor, and
        the same channels order. The shape of the Tensor can be different
        based on the effects applied. Sample rate can also be different based
        on the effects applied.

    Raises:
        RuntimeError: If the binding returns ``None``.

    Example - Basic usage
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Generate pseudo wave:
        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
        >>> waveform.shape
        [2, 16000]
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> # The new waveform is sampling rate 8000, 1 second.
        >>> # normalization and channel order are preserved
        >>> waveform.shape
        [2, 8000]
        >>> sample_rate
        8000
    """
    samples = tensor.numpy()
    result = paddleaudio.sox_effects_apply_effects_tensor(
        samples, sample_rate, effects, channels_first)
    if result is None:
        raise RuntimeError("Failed to apply sox effect")

    processed, out_sample_rate = result[0], result[1]
    return (paddle.to_tensor(processed), out_sample_rate)
@_mod_utils.requires_sox()
def apply_effects_file(
        path: str,
        effects: List[List[str]],
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
    """Apply sox effects to an audio file and load the result as a Tensor.

    Note:
        This function behaves much like the ``sox`` command, with slight
        differences. For example, the ``sox`` command adds certain effects
        automatically (such as a ``rate`` effect after ``speed``, ``pitch``
        etc), whereas this function applies only the given effects.
        Therefore, to actually apply a ``speed`` effect you also need to give
        a ``rate`` effect with the desired sampling rate, because internally
        the ``speed`` effect only alters the sampling rate and leaves the
        samples untouched.

    Args:
        path (path-like object or file-like object):
            Audio source. An object with a ``read`` attribute is passed to
            the file-object binding as is; anything else is converted with
            ``os.fspath`` and passed to the file-path binding.
        effects (List[List[str]]): List of effects.
        normalize (bool, optional):
            When ``True``, this function always returns ``float32``, and
            sample values are normalized to ``[-1.0, 1.0]``.
            If the input file is integer WAV, giving ``False`` will change the
            resulting Tensor type to an integer type. This argument has no
            effect for formats other than integer WAV.
        channels_first (bool, optional): When True, the returned Tensor has
            dimension `[channel, time]`. Otherwise, the returned Tensor's
            dimension is `[time, channel]`.
        format (str or None, optional):
            Override the format detection with the given format.
            Providing the argument might help when libsox can not infer the
            format from the header or extension.

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
        If ``normalize=False`` and the input audio file is an integer WAV
        file, then the resulting Tensor has the corresponding integer type.
        (Note 24 bit integer type is not supported)
        If ``channels_first=True``, the resulting Tensor has dimension
        `[channel, time]`, otherwise `[time, channel]`.

    Raises:
        RuntimeError: If the binding returns ``None``.

    Example - Basic usage
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> waveform.shape
        [2, 8000]
        >>> sample_rate
        8000

    Example - Apply random speed perturbation to dataset
        >>> # Load data from file, apply random speed perturbation
        >>> class RandomPerturbationFile(paddle.io.Dataset):
        ...     \"\"\"Given flist, apply random speed perturbation
        ...
        ...     Suppose all the input files are at least one second long.
        ...     \"\"\"
        ...     def __init__(self, flist: List[str], sample_rate: int):
        ...         super().__init__()
        ...         self.flist = flist
        ...         self.sample_rate = sample_rate
        ...
        ...     def __getitem__(self, index):
        ...         speed = 0.5 + 1.5 * random.random()
        ...         effects = [
        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...             ['remix', '-'],  # merge all the channels
        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...             ['rate', f'{self.sample_rate}'],
        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...             ['trim', '0', '2'],  # get the first 2 seconds
        ...         ]
        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
        ...             self.flist[index], effects)
        ...         return waveform
        ...
        ...     def __len__(self):
        ...         return len(self.flist)
        ...
        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
        >>> loader = paddle.io.DataLoader(dataset, batch_size=32)
        >>> for batch in loader:
        >>>     pass
    """
    if hasattr(path, "read"):
        # File-like object: handed to the binding without conversion.
        loaded = paddleaudio.apply_effects_fileobj(path, effects, normalize,
                                                   channels_first, format)
    else:
        path = os.fspath(path)
        loaded = paddleaudio.sox_effects_apply_effects_file(
            path, effects, normalize, channels_first, format)

    if loaded is None:
        raise RuntimeError("Failed to load audio from {}".format(path))
    return (paddle.to_tensor(loaded[0]), loaded[1])

@ -1,201 +0,0 @@
# With MSVC, export every symbol from shared libraries automatically.
if (MSVC)
  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
################################################################################
# libpaddleaudio
################################################################################
# The four variables below are accumulated here and later passed to
# define_library() for the libpaddleaudio target.
set(
  LIBPADDLEAUDIO_SOURCES
  utils.cpp
  )
set(
  LIBPADDLEAUDIO_INCLUDE_DIRS
  ${PROJECT_SOURCE_DIR}
  )
set(
  LIBPADDLEAUDIO_LINK_LIBRARIES
  )
set(
  LIBPADDLEAUDIO_COMPILE_DEFINITIONS)
#------------------------------------------------------------------------------#
# START OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
# BUILD_SOX: link against libsox and define INCLUDE_SOX.
if(BUILD_SOX)
  list(
    APPEND
    LIBPADDLEAUDIO_LINK_LIBRARIES
    libsox
    )
  # Every source entry here is commented out, so this APPEND adds nothing;
  # the sox sources are appended to EXTENSION_SOURCES further down instead.
  list(
    APPEND
    LIBPADDLEAUDIO_SOURCES
    #sox/io.cpp
    #sox/utils.cpp
    #sox/effects.cpp
    #sox/effects_chain.cpp
    #sox/types.cpp
    )
  list(
    APPEND
    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
    INCLUDE_SOX
    )
endif()
# BUILD_KALDI: link against libkaldi and define INCLUDE_KALDI and
# COMPILE_WITHOUT_OPENFST.
if(BUILD_KALDI)
  list(
    APPEND
    LIBPADDLEAUDIO_LINK_LIBRARIES
    libkaldi
    )
  list(
    APPEND
    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
    INCLUDE_KALDI
    COMPILE_WITHOUT_OPENFST
    )
endif()
#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
# define_library(<name> <source> <include_dirs> <link_libraries> <compile_defs>)
#
# Create the SHARED library target <name> from <source>, with private include
# directories and compile definitions, linked against <link_libraries>.
# The target gets an empty file-name prefix (and a ".pyd" suffix with MSVC)
# and is installed into "lib".
function (define_library name source include_dirs link_libraries compile_defs)
  add_library(${name} SHARED ${source})
  target_include_directories(${name} PRIVATE ${include_dirs})
  target_link_libraries(${name} ${link_libraries})
  target_compile_definitions(${name} PRIVATE ${compile_defs})
  # Empty PREFIX: the output file name is exactly the target name.
  set_target_properties(${name} PROPERTIES PREFIX "")
  if (MSVC)
    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
  endif(MSVC)
  install(
    TARGETS ${name}
    LIBRARY DESTINATION lib
    RUNTIME DESTINATION lib # For Windows
    )
endfunction()
# Build the core library. The arguments are quoted so that each list (even an
# empty one) is passed as a single argument.
define_library(
  libpaddleaudio
  "${LIBPADDLEAUDIO_SOURCES}"
  "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
  "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
  )
# Cache the link line for libpaddleaudio; on non-Apple platforms it is wrapped
# in --no-as-needed / --as-needed.
# NOTE(review): the variable keeps the TORCHAUDIO_ name and nothing in the
# visible part of this file reads it -- confirm whether it is still used.
if (APPLE)
  set(TORCHAUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
else()
  set(TORCHAUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
endif()
################################################################################
# _paddleaudio.so
################################################################################
# Everything below is only configured when the Python extension is requested.
if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
  # On Windows the extension is linked against the Python library explicitly.
  if (WIN32)
    find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
    set(ADDITIONAL_ITEMS Python3::Python)
  endif()
  # define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
  #
  # Create the SHARED extension target <name> from <sources>, adding the
  # project, Python and pybind11 include directories plus <include_dirs>, and
  # linking <libraries>. The target gets an empty file-name prefix (and a
  # ".pyd" suffix with MSVC) and is installed into ".".
  function(define_extension name sources include_dirs libraries definitions)
    add_library(${name} SHARED ${sources})
    target_compile_definitions(${name} PRIVATE "${definitions}")
    target_include_directories(
      ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
    # NOTE(review): TORCH_PYTHON_LIBRARY is not set anywhere in the visible
    # part of this file -- confirm whether it is still needed.
    target_link_libraries(
      ${name}
      ${libraries}
      ${TORCH_PYTHON_LIBRARY}
      ${ADDITIONAL_ITEMS}
      )
    set_target_properties(${name} PROPERTIES PREFIX "")
    if (MSVC)
      set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
    endif(MSVC)
    if (APPLE)
      # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
      # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
      set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
    endif()
    install(
      TARGETS ${name}
      LIBRARY DESTINATION .
      RUNTIME DESTINATION . # For Windows
      )
  endfunction()
  # Sources of the _paddleaudio extension; the optional backends append theirs.
  set(
    EXTENSION_SOURCES
    pybind/pybind.cpp
    )
  #----------------------------------------------------------------------------#
  # START OF CUSTOMIZATION LOGICS
  #----------------------------------------------------------------------------#
  if(BUILD_SOX)
    list(
      APPEND
      EXTENSION_SOURCES
      pybind/sox/effects.cpp
      pybind/sox/effects_chain.cpp
      pybind/sox/io.cpp
      pybind/sox/types.cpp
      pybind/sox/utils.cpp
      )
  endif()
  if(BUILD_KALDI)
    list(
      APPEND
      EXTENSION_SOURCES
      pybind/kaldi/kaldi_feature_wrapper.cc
      pybind/kaldi/kaldi_feature.cc
      )
  endif()
  #----------------------------------------------------------------------------#
  # END OF CUSTOMIZATION LOGICS
  #----------------------------------------------------------------------------#
  # Build _paddleaudio with no extra include directories, linked against
  # libpaddleaudio and compiled with the same definitions as the core library.
  define_extension(
    _paddleaudio
    "${EXTENSION_SOURCES}"
    ""
    libpaddleaudio
    "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
    )
  # if(BUILD_CTC_DECODER)
  # set(
  # DECODER_EXTENSION_SOURCES
  # decoder/bindings/pybind.cpp
  # )
  # define_extension(
  # _paddleaudio_decoder
  # "${DECODER_EXTENSION_SOURCES}"
  # ""
  # "libpaddleaudio_decoder"
  # "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
  # )
  # endif()
  # if(USE_FFMPEG)
  # set(
  # FFMPEG_EXTENSION_SOURCES
  # ffmpeg/pybind/typedefs.cpp
  # ffmpeg/pybind/pybind.cpp
  # ffmpeg/pybind/stream_reader.cpp
  # )
  # define_extension(
  # _paddleaudio_ffmpeg
  # "${FFMPEG_EXTENSION_SOURCES}"
  # "${FFMPEG_INCLUDE_DIRS}"
  # "libpaddleaudio_ffmpeg"
  # "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
  # )
  # endif()
endif()

@ -1,121 +0,0 @@
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.

File diff suppressed because it is too large Load Diff

@ -1,49 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/numpy.h"
#include "feat/feature-window.h"
namespace paddleaudio {
namespace kaldi {
namespace py = pybind11;
// Streaming wrapper around a Kaldi-style feature computer `F`.
//
// Audio is fed chunk by chunk through ComputeFeature(); samples that do not
// fill a complete frame are cached in `remained_wav_` and prepended to the
// next chunk.
template <class F>
class StreamingFeatureTpl {
  public:
    using Options = typename F::Options;

    StreamingFeatureTpl(const Options& opts);

    // Consumes `wav` (plus any cached samples) and writes the features of all
    // complete frames, flattened frame after frame, into `feats`.
    bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
                        ::kaldi::Vector<::kaldi::BaseFloat>* feats);

    // Drops the cached samples so the next chunk starts a new stream.
    void Reset() {
        remained_wav_.Resize(0);
    }

    // Dimension of one feature frame, as reported by the computer.
    int Dim() {
        return computer_.Dim();
    }

  private:
    // Extracts every frame of `waves` and stores the features in `feats`.
    bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
                 ::kaldi::Vector<::kaldi::BaseFloat>* feats);

    // NOTE: members are initialized in this declaration order.
    Options opts_;
    ::kaldi::FeatureWindowFunction window_function_;
    ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
    F computer_;
};
} // namespace kaldi
} // namespace paddleaudio
#include "feature_common_inl.h"

@ -1,93 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
namespace paddleaudio {
namespace kaldi {
// Builds the streaming extractor from the feature options `opts`.
//
// The initializer list follows the member declaration order (opts_,
// window_function_, computer_), which is the order C++ actually runs the
// initializers in.
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
    : opts_(opts), window_function_(opts.frame_opts), computer_(opts) {
    // window_function_ is built from opts.frame_opts on purpose: using
    // computer_.GetFrameOptions() here left the options set to zero
    // (window_function_ is initialized before computer_ exists).
}
// Streaming entry point: prepends the samples cached by the previous call to
// `wav`, caches the tail that is not consumed yet, and computes the features
// of the combined buffer into `feats`.
//
// Returns false when `wav` is empty or when Compute() reports that there are
// not enough samples for a single frame (in that case `feats` is left
// untouched); returns true otherwise.
template <class F>
bool StreamingFeatureTpl<F>::ComputeFeature(
    const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
    const ::kaldi::int32 wav_len = wav.Dim();
    if (wav_len == 0) return false;

    // Prepend the remaining samples of the previous chunk.
    const ::kaldi::int32 left_len = remained_wav_.Dim();
    ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
    waves.Range(0, left_len).CopyFromVec(remained_wav_);
    waves.Range(left_len, wav_len).CopyFromVec(wav);

    // Cache the samples that start after the last frame produced now.
    const ::kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const ::kaldi::int32 num_frames =
        ::kaldi::NumFrames(waves.Dim(), frame_opts);
    const ::kaldi::int32 frame_shift = frame_opts.WindowShift();
    // Clamp so that the remainder can never get a negative length when
    // frame_shift * num_frames exceeds the number of buffered samples.
    ::kaldi::int32 consumed = frame_shift * num_frames;
    if (consumed > waves.Dim()) consumed = waves.Dim();
    const ::kaldi::int32 left_samples = waves.Dim() - consumed;
    remained_wav_.Resize(left_samples);
    remained_wav_.CopyFromVec(waves.Range(consumed, left_samples));

    // Compute the speech features and propagate a failure to the caller.
    return Compute(waves, feats);
}
// Computes the features of every frame contained in `waves`.
//
// `feats` is resized to num_frames * Dim() and filled frame after frame.
// Returns false, without touching `feats`, when `waves` holds fewer samples
// than one analysis window; returns true otherwise.
template <class F>
bool StreamingFeatureTpl<F>::Compute(
    const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
    ::kaldi::Vector<::kaldi::BaseFloat>* feats) {
    const ::kaldi::BaseFloat vtln_warp = 1.0;
    const ::kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    const ::kaldi::int32 num_samples = waves.Dim();
    const ::kaldi::int32 frame_length = frame_opts.WindowSize();
    if (num_samples < frame_length) {
        return false;
    }

    const ::kaldi::int32 num_frames =
        ::kaldi::NumFrames(num_samples, frame_opts);
    const ::kaldi::int32 feat_dim = Dim();
    feats->Resize(num_frames * feat_dim);

    const bool need_raw_log_energy = computer_.NeedRawLogEnergy();
    ::kaldi::Vector<::kaldi::BaseFloat> window;
    // Scratch buffer for one frame, allocated once and overwritten by the
    // computer on every iteration.
    ::kaldi::Vector<::kaldi::BaseFloat> this_feature(feat_dim,
                                                     ::kaldi::kUndefined);
    for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
        ::kaldi::BaseFloat raw_log_energy = 0.0;
        ::kaldi::ExtractWindow(0,
                               waves,
                               frame,
                               frame_opts,
                               window_function_,
                               &window,
                               need_raw_log_energy ? &raw_log_energy : nullptr);
        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);

        // Copy the frame into its slot of the flattened output.
        ::kaldi::SubVector<::kaldi::BaseFloat> output_row(
            feats->Data() + frame * feat_dim, feat_dim);
        output_row.CopyFromVec(this_feature);
    }
    return true;
}
} // namespace kaldi
} // namespace paddleaudio

@ -1,75 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
#include "feat/pitch-functions.h"
namespace paddleaudio {
namespace kaldi {
// Assembles a ::kaldi::FbankOptions from the three option groups and
// (re)creates the fbank extractor held by the KaldiFeatureWrapper singleton.
// Always returns true.
bool InitFbank(
    ::kaldi::FrameExtractionOptions frame_opts,
    ::kaldi::MelBanksOptions mel_opts,
    FbankOptions fbank_opts) {
    ::kaldi::FbankOptions kaldi_opts;

    // Nested option groups are taken over unchanged.
    kaldi_opts.frame_opts = frame_opts;
    kaldi_opts.mel_opts = mel_opts;

    // Scalar switches are copied field by field from the binding-side struct.
    kaldi_opts.use_energy = fbank_opts.use_energy;
    kaldi_opts.energy_floor = fbank_opts.energy_floor;
    kaldi_opts.raw_energy = fbank_opts.raw_energy;
    kaldi_opts.htk_compat = fbank_opts.htk_compat;
    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
    kaldi_opts.use_power = fbank_opts.use_power;

    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->InitFbank(kaldi_opts);
    return true;
}
// Feeds one chunk of audio to the fbank extractor owned by the
// KaldiFeatureWrapper singleton and returns whatever it produces.
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    return wrapper->ComputeFbank(wav);
}
// One-shot fbank computation: initializes the shared extractor with the given
// options, runs it on `wav`, then resets the extractor's cached samples.
py::array_t<float> ComputeFbank(
    ::kaldi::FrameExtractionOptions frame_opts,
    ::kaldi::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav) {
    InitFbank(frame_opts, mel_opts, fbank_opts);

    py::array_t<float> features = ComputeFbankStreaming(wav);

    // Clear the streaming state so the next call starts from scratch.
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->ResetFbank();
    return features;
}
// Resets the streaming state of the shared fbank extractor.
void ResetFbank() {
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->ResetFbank();
}
// Runs ::kaldi::ComputeKaldiPitch on `wav` and returns the resulting feature
// matrix as a 2-D float array of shape (NumRows, NumCols).
//
// NOTE(review): the buffer pointer is read as a dense run of floats; confirm
// that callers always pass a contiguous array.
py::array_t<float> ComputeKaldiPitch(
    const ::kaldi::PitchExtractionOptions& opts,
    const py::array_t<float>& wav) {
    // View the numpy buffer as a Kaldi vector without copying.
    py::buffer_info wav_info = wav.request();
    ::kaldi::SubVector<::kaldi::BaseFloat> samples(
        static_cast<float*>(wav_info.ptr), wav_info.size);

    ::kaldi::Matrix<::kaldi::BaseFloat> pitch;
    ::kaldi::ComputeKaldiPitch(opts, samples, &pitch);

    // Copy row by row into the output array.
    auto result = py::array_t<float>({pitch.NumRows(), pitch.NumCols()});
    int row = 0;
    while (row < pitch.NumRows()) {
        std::memcpy(result.mutable_data(row),
                    pitch.Row(row).Data(),
                    sizeof(float) * pitch.NumCols());
        ++row;
    }
    return result;
}
} // namespace kaldi
} // namespace paddleaudio

@ -1,64 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <string>
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h"
#include "feat/pitch-functions.h"
namespace py = pybind11;
namespace paddleaudio {
namespace kaldi {
// Binding-side mirror of the scalar switches of ::kaldi::FbankOptions.
// Defaults are given as member initializers; the default constructor is kept
// so the type is still default-constructible in exactly the same way.
struct FbankOptions {
    // Append an extra dimension with energy to the filter banks.
    bool use_energy = false;
    float energy_floor = 0.0;
    // If true, compute energy before preemphasis and windowing.
    bool raw_energy = true;
    // If true, put energy last (if using energy).
    bool htk_compat = false;
    // If true (default), produce log-filterbank, else linear.
    bool use_log_fbank = true;
    bool use_power = true;

    FbankOptions() {}
};
// Creates (or replaces) the shared fbank extractor from the given options.
bool InitFbank(
::kaldi::FrameExtractionOptions frame_opts,
::kaldi::MelBanksOptions mel_opts,
FbankOptions fbank_opts);
// One-shot helper: initializes the extractor, computes the fbank features of
// `wav` and resets the extractor afterwards.
py::array_t<float> ComputeFbank(
::kaldi::FrameExtractionOptions frame_opts,
::kaldi::MelBanksOptions mel_opts,
FbankOptions fbank_opts,
const py::array_t<float>& wav);
// Feeds one chunk of audio to the extractor created by InitFbank().
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
// Resets the streaming state of the shared extractor.
void ResetFbank();
// Computes Kaldi pitch features for `wav`; the result is a 2-D array.
py::array_t<float> ComputeKaldiPitch(
const ::kaldi::PitchExtractionOptions& opts,
const py::array_t<float>& wav);
} // namespace kaldi
} // namespace paddleaudio

@ -1,51 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h"
namespace paddleaudio {
namespace kaldi {
// Returns the process-wide wrapper object. It is a function-local static, so
// it is constructed the first time this function runs.
KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
    static KaldiFeatureWrapper singleton;
    KaldiFeatureWrapper* const self = &singleton;
    return self;
}
// Replaces the current fbank extractor (if any) with a new one built from
// `opts`. Always returns true.
bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
    std::unique_ptr<Fbank> fresh(new Fbank(opts));
    fbank_ = std::move(fresh);
    return true;
}
// Runs the streaming fbank extractor on `wav`.
//
// Returns a 2-D array of shape (num_frames, Dim()). An empty array is
// returned when no extractor has been created with InitFbank() yet, when the
// extractor reports failure, or when no frame could be produced.
//
// NOTE(review): the input buffer is read as a dense run of floats; confirm
// that callers always pass a contiguous array.
py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
    const py::array_t<float> wav) {
    // Without an extractor there is nothing to run; avoid dereferencing null.
    if (!fbank_) return py::array_t<float>();

    // View the numpy buffer as a Kaldi vector without copying.
    py::buffer_info info = wav.request();
    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
        static_cast<float*>(info.ptr), info.size);

    ::kaldi::Vector<::kaldi::BaseFloat> feats;
    const bool ok = fbank_->ComputeFeature(input_wav, &feats);
    if (!ok || feats.Dim() == 0) return py::array_t<float>();

    // Copy the flattened features into a fresh array ...
    auto result = py::array_t<float>(feats.Dim());
    py::buffer_info xs = result.request();
    float* res_ptr = static_cast<float*>(xs.ptr);
    for (int idx = 0; idx < feats.Dim(); ++idx) {
        res_ptr[idx] = feats(idx);
    }

    // ... and expose them as one row per frame.
    const int feat_dim = Dim();
    return result.reshape({feats.Dim() / feat_dim, feat_dim});
}
} // namespace kaldi
} // namespace paddleaudio

@ -1,40 +0,0 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "base/kaldi-common.h"
#include "feat/feature-fbank.h"
#include "paddlespeech/audio/src/pybind/kaldi/feature_common.h"
namespace paddleaudio {
namespace kaldi {
typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
// Holder of the single streaming fbank extractor used by the Python bindings.
// The extractor only exists after InitFbank() has been called.
class KaldiFeatureWrapper {
  public:
    // Access to the shared instance.
    static KaldiFeatureWrapper* GetInstance();

    // Creates (or replaces) the extractor from `opts`.
    bool InitFbank(::kaldi::FbankOptions opts);

    // Runs the extractor on one chunk of audio.
    py::array_t<float> ComputeFbank(const py::array_t<float> wav);

    // Dimension of one feature frame; dereferences the extractor.
    int Dim() {
        return fbank_->Dim();
    }

    // Clears the extractor's streaming state; dereferences the extractor.
    void ResetFbank() {
        fbank_->Reset();
    }

  private:
    std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
};
} // namespace kaldi
} // namespace paddleaudio

@ -1,144 +0,0 @@
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala), All rights reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
#include "paddlespeech/audio/src/pybind/sox/io.h"
#include "paddlespeech/audio/src/pybind/sox/effects.h"
#include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"
#include <pybind11/stl.h>
#include <pybind11/pybind11.h>
// `tl::optional`
// Teach pybind11 to convert tl::optional<T> by reusing its generic
// optional_caster.
namespace pybind11 {
namespace detail {

template <typename T>
struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};

}  // namespace detail
}  // namespace pybind11
// Definition of the `_paddleaudio` extension module.
// The sox bindings are compiled in only when INCLUDE_SOX is defined and the
// Kaldi bindings only when INCLUDE_KALDI is defined.
PYBIND11_MODULE(_paddleaudio, m) {
#ifdef INCLUDE_SOX
// --- sox: file / file-object I/O -------------------------------------------
m.def("get_info_file",
&paddleaudio::sox_io::get_info_file,
"Get metadata of audio file.");
// support obj later
m.def("get_info_fileobj",
&paddleaudio::sox_io::get_info_fileobj,
"Get metadata of audio in file object.");
m.def("load_audio_fileobj",
&paddleaudio::sox_io::load_audio_fileobj,
"Load audio from file object.");
m.def("save_audio_fileobj",
&paddleaudio::sox_io::save_audio_fileobj,
"Save audio to file obj.");
// sox io
// NOTE: "sox_io_get_info" binds the same function as "get_info_file" above.
m.def("sox_io_get_info", &paddleaudio::sox_io::get_info_file);
m.def(
"sox_io_load_audio_file",
&paddleaudio::sox_io::load_audio_file);
m.def(
"sox_io_save_audio_file",
&paddleaudio::sox_io::save_audio_file);
// sox utils
m.def("sox_utils_set_seed", &paddleaudio::sox_utils::set_seed);
m.def(
"sox_utils_set_verbosity",
&paddleaudio::sox_utils::set_verbosity);
m.def(
"sox_utils_set_use_threads",
&paddleaudio::sox_utils::set_use_threads);
m.def(
"sox_utils_set_buffer_size",
&paddleaudio::sox_utils::set_buffer_size);
m.def(
"sox_utils_list_effects",
&paddleaudio::sox_utils::list_effects);
m.def(
"sox_utils_list_read_formats",
&paddleaudio::sox_utils::list_read_formats);
m.def(
"sox_utils_list_write_formats",
&paddleaudio::sox_utils::list_write_formats);
m.def(
"sox_utils_get_buffer_size",
&paddleaudio::sox_utils::get_buffer_size);
// effect
m.def("apply_effects_fileobj",
&paddleaudio::sox_effects::apply_effects_fileobj,
"Decode audio data from file-like obj and apply effects.");
m.def("sox_effects_initialize_sox_effects",
&paddleaudio::sox_effects::initialize_sox_effects);
m.def(
"sox_effects_shutdown_sox_effects",
&paddleaudio::sox_effects::shutdown_sox_effects);
m.def(
"sox_effects_apply_effects_tensor",
&paddleaudio::sox_effects::apply_effects_tensor);
m.def(
"sox_effects_apply_effects_file",
&paddleaudio::sox_effects::apply_effects_file);
#endif
#ifdef INCLUDE_KALDI
// --- Kaldi: feature functions and their option structs ----------------------
// Only the one-shot ComputeFbank and ComputeKaldiPitch functions are bound
// here; every option struct is exposed with read/write attributes.
m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");
// Options consumed by ComputeKaldiPitch.
py::class_<kaldi::PitchExtractionOptions>(m, "PitchExtractionOptions")
.def(py::init<>())
.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq)
.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms)
.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms)
.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff)
.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0)
.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0)
.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0)
.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor)
.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff)
.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq)
.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch)
.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast)
.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width)
.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width)
.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency)
.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk)
.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online)
.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame)
.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online)
.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges);
m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch");
// Framing options (first argument of ComputeFbank).
py::class_<kaldi::FrameExtractionOptions>(m, "FrameExtractionOptions")
.def(py::init<>())
.def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq)
.def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms)
.def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms)
.def_readwrite("dither", &kaldi::FrameExtractionOptions::dither)
.def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff)
.def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset)
.def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type)
.def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two)
.def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff)
.def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges)
.def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample)
.def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample)
.def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors);
// Mel filter bank options (second argument of ComputeFbank).
py::class_<kaldi::MelBanksOptions>(m, "MelBanksOptions")
.def(py::init<>())
.def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins)
.def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq)
.def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq)
.def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low)
.def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high)
.def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel)
.def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode);
// paddleaudio's own scalar fbank switches (third argument of ComputeFbank).
py::class_<paddleaudio::kaldi::FbankOptions>(m, "FbankOptions")
.def(py::init<>())
.def_readwrite("use_energy", &paddleaudio::kaldi::FbankOptions::use_energy)
.def_readwrite("energy_floor", &paddleaudio::kaldi::FbankOptions::energy_floor)
.def_readwrite("raw_energy", &paddleaudio::kaldi::FbankOptions::raw_energy)
.def_readwrite("htk_compat", &paddleaudio::kaldi::FbankOptions::htk_compat)
.def_readwrite("use_log_fbank", &paddleaudio::kaldi::FbankOptions::use_log_fbank)
.def_readwrite("use_power", &paddleaudio::kaldi::FbankOptions::use_power);
#endif
}

@ -1,257 +0,0 @@
#include <mutex>
#include <sox.h>
#include "paddlespeech/audio/src/pybind/sox/effects.h"
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
using namespace paddleaudio::sox_utils;
namespace paddleaudio::sox_effects {
// Streaming decoding over file-like object is tricky because libsox operates on
// FILE pointer. The following is what `sox` and `play` commands do
// - file input -> FILE pointer
// - URL input -> call wget in subprocess and pipe the data -> FILE pointer
// - stdin -> FILE pointer
//
// We want to, instead, fetch byte strings chunk by chunk, consume them, and
// discard.
//
// Here is the approach
// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
// chunk of byte string
// This will perform header-based format detection, if necessary, then fill
// the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
// which returns FILE* which points the buffer of the provided byte string.
// 2. Each time sox reads a chunk from the FILE*, we update the underlying
// buffer in a way that it
// starts with unseen data, and append the new data read from the given
// fileobj. This will trick libsox as if it keeps reading from the FILE*
// continuously.
// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
// Decodes audio from a Python file-like object, runs it through the given
// sox effects and returns (samples, sample_rate).
//
// An empty optional is returned when libsox cannot open the data or cannot
// determine its encoding.
auto apply_effects_fileobj(
    py::object fileobj,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    tl::optional<std::string> format)
    -> tl::optional<std::tuple<py::array, int64_t>> {
  // The buffer below lives for the whole lifetime of the effects chain.
  //
  // For some formats (e.g. FLAC) libsox keeps reading while it opens the
  // file, even after the header has been parsed, unless it hits EOF. The
  // buffer therefore must always hold valid data, except after EOF. Its size
  // defaults to `sox_get_globals()->bufsiz` (changeable through
  // `paddleaudio.utils.sox_utils.set_buffer_size`), with a floor of 256
  // bytes. `read_fileobj` keeps calling `read` until it has the requested
  // number of bytes or reaches EOF, so a short read means the whole audio
  // has been fetched.
  //
  // NOTE: the buffer size is obtained through `get_buffer_size()`, the
  // abstraction provided by `libpaddleaudio`. Calling `sox_get_globals`
  // directly would return the static variable defined in `_paddleaudio`,
  // which is not correct.
  const auto bufsiz = get_buffer_size();
  const int64_t kDefaultCapacityInBytes = 256;
  const auto capacity =
      (bufsiz > kDefaultCapacityInBytes) ? bufsiz : kDefaultCapacityInBytes;

  std::string buffer(capacity, '\0');
  auto* in_buf = &buffer[0];
  auto num_read = read_fileobj(&fileobj, capacity, in_buf);
  // libsox cannot read the header from fewer than 256 bytes, so never report
  // a smaller buffer.
  auto in_buffer_size = (num_read > 256) ? num_read : 256;

  // Open the in-memory "file"; this already starts reading the header.
  // Two functions can touch the FILE* while a file is being opened:
  // * `auto_detect_format`
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
  // * the `startread` handler of the detected format
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
  // The handler of a particular format lives in
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
  // e.g. for vorbis:
  //   https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
  const char* filetype = format.has_value() ? format.value().c_str() : nullptr;
  SoxFormat sf(sox_open_mem_read(
      in_buf,
      in_buffer_size,
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/filetype));

  // Give up when the data could not be opened or its encoding is unknown.
  const bool open_failed = static_cast<sox_format_t*>(sf) == nullptr;
  if (open_failed || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    return {};
  }

  // Output samples are collected here; for streamed data the announced
  // length can be 0.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(sf->signal.length);

  // Build the chain: file-object input -> effects -> output buffer.
  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_tensor_encodinginfo(dtype));
  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
  for (const auto& effect_args : effects) {
    chain.addEffect(effect_args);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Turn the collected samples into an array.
  const bool channels_first_ = channels_first.value_or(true);
  auto tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      channels_first_);

  return std::tuple<py::array, int64_t>(
      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
}
// File-local bookkeeping for the sox effects library.
namespace {
// Life-cycle of the sox effects resources as tracked by
// initialize_sox_effects() / shutdown_sox_effects().
enum SoxEffectsResourceState { NotInitialized, Initialized, ShutDown };
// Current state; starts as NotInitialized.
SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;
// Guards SOX_RESOURCE_STATE; locked by both functions named above.
std::mutex SOX_RESOUCE_STATE_MUTEX;
} // namespace
// Initializes the sox effects library exactly once.
//
// Calling it again while initialized is a no-op. Throws std::runtime_error
// when sox_init() fails or when the library has already been shut down.
void initialize_sox_effects() {
  const std::lock_guard<std::mutex> guard(SOX_RESOUCE_STATE_MUTEX);

  if (SOX_RESOURCE_STATE == Initialized) {
    return;
  }
  if (SOX_RESOURCE_STATE == ShutDown) {
    throw std::runtime_error(
        "SoX Effects has been shut down. Cannot initialize again.");
  }
  if (SOX_RESOURCE_STATE == NotInitialized) {
    if (sox_init() != SOX_SUCCESS) {
      throw std::runtime_error("Failed to initialize sox effects.");
    }
    SOX_RESOURCE_STATE = Initialized;
  }
}
// Shuts the sox effects library down.
//
// Calling it again after shutdown is a no-op. Throws std::runtime_error when
// the library was never initialized or when sox_quit() fails.
void shutdown_sox_effects() {
  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
  switch (SOX_RESOURCE_STATE) {
    case NotInitialized:
      throw std::runtime_error(
          "SoX Effects is not initialized. Cannot shutdown.");
    case Initialized:
      if (sox_quit() != SOX_SUCCESS) {
        // Report the operation that actually failed (the message used to be
        // a copy of the one in initialize_sox_effects()).
        throw std::runtime_error("Failed to shut down sox effects.");
      }
      SOX_RESOURCE_STATE = ShutDown;
      break;
    case ShutDown:
      break;
  }
}
// Applies the given sox effects to an in-memory waveform and returns
// (processed samples, output sample rate). The output keeps the dtype of the
// input and is never normalized.
auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t> {
  validate_input_tensor(waveform);

  // Input and output of the chain use the encoding derived from the dtype.
  const auto dtype = waveform.dtype();
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/get_tensor_encodinginfo(dtype),
      /*output_encoding=*/get_tensor_encodinginfo(dtype));

  // Samples produced by the chain are collected here.
  std::vector<sox_sample_t> samples;
  samples.reserve(waveform.size());

  // Chain layout: array input -> effects -> output buffer.
  chain.addInputTensor(&waveform, sample_rate, channels_first);
  for (const auto& effect_args : effects) {
    chain.addEffect(effect_args);
  }
  chain.addOutputBuffer(&samples);
  chain.run();

  // Turn the collected samples back into an array.
  auto processed = convert_to_tensor(
      /*buffer=*/samples.data(),
      /*num_samples=*/samples.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      /*normalize=*/false,
      channels_first);

  return std::tuple<py::array, int64_t>(
      processed, chain.getOutputSampleRate());
}
// Opens the audio file at `path`, runs it through the given sox effects and
// returns (samples, sample_rate).
//
// An empty optional is returned when libsox cannot open the file or cannot
// determine its encoding.
auto apply_effects_file(
    const std::string& path,
    const std::vector<std::vector<std::string>>& effects,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format)
    -> tl::optional<std::tuple<py::array, int64_t>> {
  // Open the input; an explicit format overrides detection when given.
  const char* filetype = format.has_value() ? format.value().c_str() : nullptr;
  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/filetype));

  const bool open_failed = static_cast<sox_format_t*>(sf) == nullptr;
  if (open_failed || sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
    return {};
  }

  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);

  // Samples produced by the chain are collected here.
  std::vector<sox_sample_t> samples;
  samples.reserve(sf->signal.length);

  // Chain layout: file input -> effects -> output buffer.
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_tensor_encodinginfo(dtype));
  chain.addInputFile(sf);
  for (const auto& effect_args : effects) {
    chain.addEffect(effect_args);
  }
  chain.addOutputBuffer(&samples);
  chain.run();

  // Turn the collected samples into an array.
  const bool channels_first_ = channels_first.value_or(true);
  auto tensor = convert_to_tensor(
      /*buffer=*/samples.data(),
      /*num_samples=*/samples.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      channels_first_);

  return std::tuple<py::array, int64_t>(
      tensor, chain.getOutputSampleRate());
}
} // namespace paddleaudio::sox_effects

@ -1,36 +0,0 @@
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace py = pybind11;
namespace paddleaudio::sox_effects {
// Decodes audio from a Python file-like object and applies sox effects.
// Returns (samples, sample_rate), or an empty optional on failure.
auto apply_effects_fileobj(
py::object fileobj,
const std::vector<std::vector<std::string>>& effects,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
tl::optional<std::string> format)
-> tl::optional<std::tuple<py::array, int64_t>>;
// Initializes the sox effects library.
void initialize_sox_effects();
// Shuts the sox effects library down.
void shutdown_sox_effects();
// Applies sox effects to an in-memory waveform.
// Returns (samples, sample_rate).
auto apply_effects_tensor(
py::array waveform,
int64_t sample_rate,
const std::vector<std::vector<std::string>>& effects,
bool channels_first) -> std::tuple<py::array, int64_t>;
// Loads the audio file at `path` and applies sox effects.
// Returns (samples, sample_rate), or an empty optional on failure.
auto apply_effects_file(
const std::string& path,
const std::vector<std::vector<std::string>>& effects,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format)
-> tl::optional<std::tuple<py::array, int64_t>>;
} // namespace paddleaudio::sox_effects

@ -1,595 +0,0 @@
#include <sox.h>
#include <iostream>
#include <vector>
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
using namespace paddleaudio::sox_utils;
namespace paddleaudio::sox_effects_chain {
namespace {
/// helper classes for passing the location of input tensor and output buffer
///
/// drain/flow callback functions require a plain C-style function signature and
/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
/// The following structs will be assigned to sox_effect_t::priv pointer which
/// gives sox_effect_t an access to input Tensor and output buffer object.
/// State of the array input effect, read by `tensor_input_drain`.
struct TensorInputPriv {
// Number of samples already handed to the chain.
size_t index;
// Source array; not owned by this struct.
py::array* waveform;
int64_t sample_rate;
// True when `waveform` is indexed as (channel, frame), false for
// (frame, channel).
bool channels_first;
};
/// State of the buffer output effect, used by `tensor_output_flow`.
struct TensorOutputPriv {
// Destination the incoming samples are appended to; not owned.
std::vector<sox_sample_t>* buffer;
};
/// State of the file output effect, used by `file_output_flow`.
struct FileOutputPriv {
// Output file handle the samples are written to.
sox_format_t* sf;
};
/// Callback function to feed Tensor data to SoxEffectChain.
///
/// Converts up to `*osamp` samples of the input array, starting at the
/// position stored in the effect's TensorInputPriv, to sox_sample_t and
/// writes them to `obuf`. `*osamp` is updated to the number of samples
/// actually written (always a whole number of frames).
///
/// Returns SOX_EOF once the whole array has been consumed, SOX_SUCCESS
/// otherwise. Throws std::runtime_error for an unsupported dtype.
int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
  // Retrieve the input array and the current read position.
  auto priv = static_cast<TensorInputPriv*>(effp->priv);
  const auto index = priv->index;
  auto tensor = *(priv->waveform);
  const auto num_channels = effp->out_signal.channels;

  // Adjust the number of samples to read so we never run past the end.
  const size_t num_samples = tensor.size();
  if (index + *osamp > num_samples) {
    *osamp = num_samples - index;
  }
  // Ensure that it's a multiple of the number of channels.
  *osamp -= *osamp % num_channels;
  const size_t count = *osamp;

  // Address of the element `offset` samples after the read position. Samples
  // are interleaved on the sox side, so the flat position is split into a
  // (frame, channel) pair and mapped onto the 2-D array layout.
  const bool channels_first = priv->channels_first;
  auto element_at = [&](size_t offset) -> const void* {
    const int frame_idx = static_cast<int>((offset + index) / num_channels);
    const int channel_idx = static_cast<int>((offset + index) % num_channels);
    return channels_first ? tensor.data(channel_idx, frame_idx)
                          : tensor.data(frame_idx, channel_idx);
  };

  // Convert to sox_sample_t (int32_t), dispatching on the dtype number.
  switch (tensor.dtype().num()) {
    // float
    case 11: {
      for (size_t i = 0; i < count; ++i) {
        // Scale in 64-bit precision so that values around INT32_MIN/MAX are
        // handled correctly, then clamp to the int32 range.
        const double scaled =
            static_cast<double>(*static_cast<const float*>(element_at(i))) *
            2147483648.;
        if (scaled > INT32_MAX) {
          obuf[i] = INT32_MAX;
        } else if (scaled < INT32_MIN) {
          obuf[i] = INT32_MIN;
        } else {
          obuf[i] = static_cast<sox_sample_t>(scaled);
        }
      }
      break;
    }
    // int
    case 5: {
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = *static_cast<const int*>(element_at(i));
      }
      break;
    }
    // short
    case 3: {
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = *static_cast<const int16_t*>(element_at(i)) * 65536;
      }
      break;
    }
    // byte
    case 1: {
      // NOTE(review): subtracting 128 looks like an unsigned 8-bit
      // conversion although the element is read as int8_t; confirm which
      // dtype is really expected here.
      for (size_t i = 0; i < count; ++i) {
        obuf[i] = (*static_cast<const int8_t*>(element_at(i)) - 128) * 16777216;
      }
      break;
    }
    default:
      throw std::runtime_error("Unexpected dtype.");
  }

  // Remember how far we got and signal EOF once everything was consumed.
  priv->index += *osamp;
  return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
}
/// Sink `flow` callback of the effects chain.
///
/// Appends the `*isamp` samples found in `ibuf` to the vector registered in
/// the effect's `TensorOutputPriv`. Nothing is forwarded downstream, so
/// `*osamp` is always set to 0.
///
/// @return always SOX_SUCCESS.
int tensor_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  // This effect never produces output samples of its own.
  *osamp = 0;

  auto* state = static_cast<TensorOutputPriv*>(effp->priv);
  auto* sink = state->buffer;

  // Copy the incoming range [first, last) to the tail of the sink.
  sox_sample_t const* first = ibuf;
  sox_sample_t const* last = ibuf + *isamp;
  sink->insert(sink->end(), first, last);

  return SOX_SUCCESS;
}
/// Sink `flow` callback that encodes the incoming samples into the
/// `sox_format_t` stored in the effect's `FileOutputPriv`.
///
/// `*osamp` is always set to 0 because nothing is forwarded downstream.
///
/// @return SOX_SUCCESS when all `*isamp` samples were written (or there was
///         nothing to write); SOX_EOF on a short write without a libsox
///         error code.
/// @throws std::runtime_error on a short write when `sox_errno` is set.
int file_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  // Nothing to encode.
  if (!*isamp) {
    return SOX_SUCCESS;
  }

  auto out = static_cast<FileOutputPriv*>(effp->priv)->sf;
  if (sox_write(out, ibuf, *isamp) == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: surface the libsox error if there is one.
  if (out->sox_errno) {
    std::ostringstream message;
    message << out->sox_errstr << " " << sox_strerror(out->sox_errno) << " "
            << out->filename;
    throw std::runtime_error(message.str());
  }
  return SOX_EOF;
}
/// Returns the handler of the "input_tensor" effect.
///
/// The handler is a function-local static, so every call yields a pointer to
/// the same object. Only the `drain` callback is set (`tensor_input_drain`);
/// the effect has no `flow` callback. `priv_size` is the size of
/// `TensorInputPriv`.
sox_effect_handler_t* get_tensor_input_handler() {
static sox_effect_handler_t handler{
/*name=*/"input_tensor",
/*usage=*/NULL,
/*flags=*/SOX_EFF_MCHAN,
/*getopts=*/NULL,
/*start=*/NULL,
/*flow=*/NULL,
/*drain=*/tensor_input_drain,
/*stop=*/NULL,
/*kill=*/NULL,
/*priv_size=*/sizeof(TensorInputPriv)};
return &handler;
}
/// Returns the handler of the "output_tensor" effect.
///
/// The handler is a function-local static, so every call yields a pointer to
/// the same object. Only the `flow` callback is set (`tensor_output_flow`);
/// `priv_size` is the size of `TensorOutputPriv`.
sox_effect_handler_t* get_tensor_output_handler() {
static sox_effect_handler_t handler{
/*name=*/"output_tensor",
/*usage=*/NULL,
/*flags=*/SOX_EFF_MCHAN,
/*getopts=*/NULL,
/*start=*/NULL,
/*flow=*/tensor_output_flow,
/*drain=*/NULL,
/*stop=*/NULL,
/*kill=*/NULL,
/*priv_size=*/sizeof(TensorOutputPriv)};
return &handler;
}
/// Returns the handler of the "output_file" effect.
///
/// The handler is a function-local static, so every call yields a pointer to
/// the same object. Only the `flow` callback is set (`file_output_flow`);
/// `priv_size` is the size of `FileOutputPriv`.
sox_effect_handler_t* get_file_output_handler() {
static sox_effect_handler_t handler{
/*name=*/"output_file",
/*usage=*/NULL,
/*flags=*/SOX_EFF_MCHAN,
/*getopts=*/NULL,
/*start=*/NULL,
/*flow=*/file_output_flow,
/*drain=*/NULL,
/*stop=*/NULL,
/*kill=*/NULL,
/*priv_size=*/sizeof(FileOutputPriv)};
return &handler;
}
} // namespace
/// Wraps `se`; the wrapped pointer is released with `free()` by the
/// destructor. `se` may be null.
SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
/// Releases the wrapped `sox_effect_t` with `free()`.
SoxEffect::~SoxEffect() {
  // free() is defined to do nothing for a null pointer, so no explicit
  // null check is required here.
  free(se_);
}
/// Implicit conversion to the wrapped raw pointer, so a `SoxEffect` can be
/// passed directly to libsox functions. Ownership is not transferred.
SoxEffect::operator sox_effect_t*() const {
return se_;
}
/// Member access on the wrapped `sox_effect_t` (e.g. `e->priv`).
auto SoxEffect::operator->() noexcept -> sox_effect_t* {
return se_;
}
/// Creates an empty libsox effects chain for the given input/output encodings.
///
/// The encodings are copied into `in_enc_` / `out_enc_` and the chain is
/// created from pointers to those members (not to the by-value parameters).
/// The three signal-info members are value-initialized.
///
/// @throws std::runtime_error if `sox_create_effects_chain` returns null.
SoxEffectsChain::SoxEffectsChain(
sox_encodinginfo_t input_encoding,
sox_encodinginfo_t output_encoding)
: in_enc_(input_encoding),
out_enc_(output_encoding),
in_sig_(),
interm_sig_(),
out_sig_(),
sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
if (!sec_) {
throw std::runtime_error("Failed to create effect chain.");
}
}
/// Destroys the underlying effects chain, if one was created.
SoxEffectsChain::~SoxEffectsChain() {
if (sec_ != nullptr) {
sox_delete_effects_chain(sec_);
}
}
/// Runs the effects chain via `sox_flow_effects`, with no callback and no
/// client data.
///
/// NOTE(review): the return value of `sox_flow_effects` is discarded, so a
/// failure while flowing is not reported to the caller from here.
void SoxEffectsChain::run() {
sox_flow_effects(sec_, NULL, NULL);
}
/// Adds the "input_tensor" effect, which feeds `waveform` into the chain.
///
/// The input and intermediate signal infos are derived from the array via
/// `get_signalinfo` (using "wav" as the file type). Only the pointer to
/// `waveform` is stored in the effect's private data, so the array must
/// outlive the chain run.
///
/// @param waveform       array holding the audio samples.
/// @param sample_rate    sample rate of `waveform`.
/// @param channels_first layout flag forwarded to the input effect.
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChain::addInputTensor(
    py::array* waveform,
    int64_t sample_rate,
    bool channels_first) {
  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
  interm_sig_ = in_sig_;

  SoxEffect source(sox_create_effect(get_tensor_input_handler()));

  // Populate the private state read by the drain callback.
  auto* state = static_cast<TensorInputPriv*>(source->priv);
  state->index = 0;
  state->waveform = waveform;
  state->sample_rate = sample_rate;
  state->channels_first = channels_first;

  const auto status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input_tensor");
  }
}
void SoxEffectsChain::addOutputBuffer(
std::vector<sox_sample_t>* output_buffer) {
SoxEffect e(sox_create_effect(get_tensor_output_handler()));
static_cast<TensorOutputPriv*>(e->priv)->buffer = output_buffer;
if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
throw std::runtime_error(
"Internal Error: Failed to add effect: output_tensor");
}
}
/// Adds libsox's built-in "input" effect, reading from the opened file `sf`.
///
/// The input and intermediate signal infos are taken from `sf->signal`.
///
/// @param sf format handle opened for reading; must outlive the chain run.
/// @throws std::runtime_error if the effect rejects its option (the file
///         handle) or cannot be added to the chain.
void SoxEffectsChain::addInputFile(sox_format_t* sf) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  // Both failure paths report the same message.
  const auto fail = [sf]() {
    std::ostringstream stream;
    stream << "Internal Error: Failed to add effect: input " << sf->filename;
    throw std::runtime_error(stream.str());
  };

  SoxEffect e(sox_create_effect(sox_find_effect("input")));

  // The "input" effect receives the format handle itself, disguised as a
  // char* option. Its result used to be ignored; a rejected handle must not
  // be added to the chain.
  char* opts[] = {(char*)sf};
  if (sox_effect_options(e, 1, opts) != SOX_SUCCESS) {
    fail();
  }
  if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
    fail();
  }
}
/// Adds the "output_file" effect, which encodes the chain output into `sf`.
///
/// The output signal info is taken from `sf->signal`. Only the pointer is
/// stored, so `sf` must outlive the chain run.
///
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
  out_sig_ = sf->signal;

  SoxEffect sink(sox_create_effect(get_file_output_handler()));
  auto* state = static_cast<FileOutputPriv*>(sink->priv);
  state->sf = sf;

  if (sox_add_effect(sec_, sink, &interm_sig_, &out_sig_) == SOX_SUCCESS) {
    return;
  }

  std::ostringstream message;
  message << "Internal Error: Failed to add effect: output " << sf->filename;
  throw std::runtime_error(message.str());
}
void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
const auto num_args = effect.size();
if (num_args == 0) {
throw std::runtime_error("Invalid argument: empty effect.");
}
const auto name = effect[0];
if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) {
std::ostringstream stream;
stream << "Unsupported effect: " << name;
throw std::runtime_error(stream.str());
}
auto returned_effect = sox_find_effect(name.c_str());
if (!returned_effect) {
std::ostringstream stream;
stream << "Unsupported effect: " << name;
throw std::runtime_error(stream.str());
}
SoxEffect e(sox_create_effect(returned_effect));
const auto num_options = num_args - 1;
std::vector<char*> opts;
for (size_t i = 1; i < num_args; ++i) {
opts.push_back((char*)effect[i].c_str());
}
if (sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) !=
SOX_SUCCESS) {
std::ostringstream stream;
stream << "Invalid effect option:";
for (const auto& v : effect) {
stream << " " << v;
}
throw std::runtime_error(stream.str());
}
if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
std::ostringstream stream;
stream << "Internal Error: Failed to add effect: \"" << name;
for (size_t i = 1; i < num_args; ++i) {
stream << " " << effect[i];
}
stream << "\"";
throw std::runtime_error(stream.str());
}
}
/// Returns the channel count recorded in the intermediate signal info, i.e.
/// the value `sox_add_effect` was last given to update.
int64_t SoxEffectsChain::getOutputNumChannels() {
return interm_sig_.channels;
}
/// Returns the sample rate recorded in the intermediate signal info,
/// converted to `int64_t`.
int64_t SoxEffectsChain::getOutputSampleRate() {
return interm_sig_.rate;
}
namespace {
/// Helper structs for passing a Python file-like object to SoxEffectsChain.
///
/// Private state of the "input_fileobj_object" effect.
struct FileObjInputPriv {
// Format handle that decodes from the in-memory buffer below.
sox_format_t* sf;
// Python file-like object the raw bytes are pulled from.
py::object* fileobj;
// Set once a read from `fileobj` returns fewer bytes than requested.
bool eof_reached;
// Memory region the format handle reads from; refilled by the drain callback.
char* buffer;
// Size of `buffer` in bytes.
uint64_t buffer_size;
};
/// Private state of the "output_fileobj_object" effect.
struct FileObjOutputPriv {
// Format handle that encodes into the memory stream below.
sox_format_t* sf;
// Python file-like object whose `write` method receives the encoded bytes.
py::object* fileobj;
// Address of the pointer to the encoded data (see save_audio_fileobj, which
// passes the same address to sox_open_memstream_write).
char** buffer;
// Address of the size companion of `buffer`. Stored by addOutputFileObj;
// the flow callback visible in this file does not read it.
size_t* buffer_size;
};
/// Drain callback that feeds bytes from a Python file-like object to libsox
/// and decodes them into `obuf`.
/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
///
/// Each call (1) compacts the not-yet-consumed bytes of the in-memory buffer,
/// tops the buffer up with fresh data from the file object, and (2) asks
/// libsox to decode up to `*osamp` samples.
///
/// @param effp  effect whose private data is a `FileObjInputPriv`.
/// @param obuf  destination for decoded samples.
/// @param osamp in: capacity of `obuf`; out: number of samples decoded.
/// @return SOX_EOF once the file object is exhausted and nothing more could
///         be decoded, SOX_SUCCESS otherwise.
/// @throws std::runtime_error if `ftell` fails or reports a position past
///         the end of the buffer.
auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
    -> int {
  auto priv = static_cast<FileObjInputPriv*>(effp->priv);
  auto sf = priv->sf;
  auto buffer = priv->buffer;

  // 1. Refresh the buffer
  //
  // NOTE:
  //   Since the underlying FILE* was opened with `fmemopen`, the only way
  //   libsox detects EOF is by reaching the end of the buffer (a null byte
  //   won't help). Therefore the content has to be aligned to the end of the
  //   buffer; otherwise libsox keeps reading beyond the intended length.
  //
  // Before:
  //
  //     |<-------consumed------>|<---remaining--->|
  //     |***********************|-----------------|
  //                             ^ ftell
  //
  // After:
  //
  //     |<-offset->|<---remaining--->|<-new data->|
  //     |**********|-----------------|++++++++++++|
  //                ^ ftell
  //
  // NOTE:
  //   Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
  //   supposed to be in sync, but there are cases (Vorbis) where they are not
  //   and `tell_off` has a seemingly uninitialized value, which makes
  //   num_remain negative and causes a segmentation fault in `memmove`.
  const auto tell = ftell((FILE*)sf->fp);
  if (tell < 0) {
    throw std::runtime_error("Internal Error: ftell failed.");
  }
  const auto num_consumed = static_cast<size_t>(tell);
  if (num_consumed > priv->buffer_size) {
    throw std::runtime_error("Internal Error: buffer overrun.");
  }
  const auto num_remain = priv->buffer_size - num_consumed;

  // 1.1. Fetch the data to see if there is data to fill the buffer
  size_t num_refill = 0;
  std::string chunk(num_consumed, '\0');
  if (num_consumed && !priv->eof_reached) {
    num_refill = read_fileobj(
        priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
    if (num_refill < num_consumed) {
      priv->eof_reached = true;
    }
  }
  const auto offset = num_consumed - num_refill;

  // 1.2. Move the unconsumed data towards the beginning of buffer.
  if (num_remain) {
    auto src = static_cast<void*>(buffer + num_consumed);
    auto dst = static_cast<void*>(buffer + offset);
    memmove(dst, src, num_remain);
  }

  // 1.3. Refill the remaining buffer.
  if (num_refill) {
    auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
    auto dst = buffer + offset + num_remain;
    memcpy(dst, src, num_refill);
  }

  // 1.4. Set the file pointer to the new offset
  sf->tell_off = offset;
  fseek((FILE*)sf->fp, offset, SEEK_SET);

  // 2. Perform decoding operation
  // The following part is practically same as "input" effect
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48

  // At this point, osamp represents the buffer size in bytes,
  // but sox_read expects the maximum number of samples ready to read.
  // Normally, this is fine, but in case when the samples are not 4-byte
  // aligned, (e.g. sample is 24bits), the resulting signal is not correct.
  // https://github.com/pytorch/audio/issues/2083
  //
  // Encodings with fewer than 8 bits per sample yield 0 whole bytes per
  // sample; skip the scaling for them instead of dividing by zero.
  const auto bytes_per_sample = sf->encoding.bits_per_sample / 8;
  if (bytes_per_sample > 0) {
    *osamp /= bytes_per_sample;
  }

  // Ensure that it's a multiple of the number of channels
  *osamp -= *osamp % effp->out_signal.channels;

  // Read up to *osamp samples into obuf;
  // store the actual number read back to *osamp
  *osamp = sox_read(sf, obuf, *osamp);

  // Decoding is finished when fileobject is exhausted and sox can no longer
  // decode a sample.
  return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
}
/// Sink `flow` callback that encodes the incoming samples and forwards the
/// encoded bytes to a Python file-like object.
///
/// Each call encodes `*isamp` samples into the memory stream behind `sf`,
/// passes everything written so far to `fileobj.write(...)`, and rewinds the
/// stream so the next call starts from the beginning of the buffer.
/// `*osamp` is always set to 0 because nothing is forwarded downstream.
///
/// @return SOX_SUCCESS when all samples were encoded (or there was nothing to
///         encode); SOX_EOF on a short write without a libsox error code.
/// @throws std::runtime_error if `ftell` fails, or on a short write when
///         `sox_errno` is set.
auto fileobj_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) -> int {
  *osamp = 0;
  if (*isamp) {
    auto priv = static_cast<FileObjOutputPriv*>(effp->priv);
    auto sf = priv->sf;
    auto fp = static_cast<FILE*>(sf->fp);
    auto fileobj = priv->fileobj;
    auto buffer = priv->buffer;

    // Encode chunk
    auto num_samples_written = sox_write(sf, ibuf, *isamp);
    fflush(fp);

    // ftell reports -1 on failure; converting that to a byte count would make
    // py::bytes read far past the end of the buffer.
    const auto num_bytes = ftell(fp);
    if (num_bytes < 0) {
      throw std::runtime_error("Internal Error: ftell failed.");
    }

    // Copy the encoded chunk to python object.
    fileobj->attr("write")(
        py::bytes(*buffer, static_cast<size_t>(num_bytes)));

    // Reset FILE*
    sf->tell_off = 0;
    fseek(fp, 0, SEEK_SET);

    if (num_samples_written != *isamp) {
      if (sf->sox_errno) {
        std::ostringstream stream;
        stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
               << sf->filename;
        throw std::runtime_error(stream.str());
      }
      return SOX_EOF;
    }
  }
  return SOX_SUCCESS;
}
/// Returns the handler of the "input_fileobj_object" effect.
///
/// The handler is a function-local static, so every call yields a pointer to
/// the same object. Only the `drain` callback is set (`fileobj_input_drain`);
/// `priv_size` is the size of `FileObjInputPriv`.
auto get_fileobj_input_handler() -> sox_effect_handler_t* {
static sox_effect_handler_t handler{
/*name=*/"input_fileobj_object",
/*usage=*/nullptr,
/*flags=*/SOX_EFF_MCHAN,
/*getopts=*/nullptr,
/*start=*/nullptr,
/*flow=*/nullptr,
/*drain=*/fileobj_input_drain,
/*stop=*/nullptr,
/*kill=*/nullptr,
/*priv_size=*/sizeof(FileObjInputPriv)};
return &handler;
}
/// Returns the handler of the "output_fileobj_object" effect.
///
/// The handler is a function-local static, so every call yields a pointer to
/// the same object. Only the `flow` callback is set (`fileobj_output_flow`);
/// `priv_size` is the size of `FileObjOutputPriv`.
auto get_fileobj_output_handler() -> sox_effect_handler_t* {
static sox_effect_handler_t handler{
/*name=*/"output_fileobj_object",
/*usage=*/nullptr,
/*flags=*/SOX_EFF_MCHAN,
/*getopts=*/nullptr,
/*start=*/nullptr,
/*flow=*/fileobj_output_flow,
/*drain=*/nullptr,
/*stop=*/nullptr,
/*kill=*/nullptr,
/*priv_size=*/sizeof(FileObjOutputPriv)};
return &handler;
}
} // namespace
/// Adds the "input_fileobj_object" effect, which decodes audio pulled from a
/// Python file-like object.
///
/// The input and intermediate signal infos are taken from `sf->signal`.
/// Only pointers are stored; `sf`, `buffer` and `fileobj` must outlive the
/// chain run.
///
/// @param sf          format handle reading from `buffer`.
/// @param buffer      memory region backing `sf`.
/// @param buffer_size size of `buffer` in bytes.
/// @param fileobj     Python object providing the raw bytes.
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChainPyBind::addInputFileObj(
    sox_format_t* sf,
    char* buffer,
    uint64_t buffer_size,
    py::object* fileobj) {
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect source(sox_create_effect(get_fileobj_input_handler()));

  // Populate the private state read by the drain callback.
  auto* state = static_cast<FileObjInputPriv*>(source->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->eof_reached = false;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, source, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input fileobj");
  }
}
/// Adds the "output_fileobj_object" effect, which encodes the chain output
/// and writes it to a Python file-like object.
///
/// The output signal info is taken from `sf->signal`. Only pointers are
/// stored; all arguments must outlive the chain run.
///
/// @param sf          format handle writing into a memory stream.
/// @param buffer      address of the memory stream's data pointer.
/// @param buffer_size address of the memory stream's size.
/// @param fileobj     Python object receiving the encoded bytes.
/// @throws std::runtime_error if the effect cannot be added.
void SoxEffectsChainPyBind::addOutputFileObj(
    sox_format_t* sf,
    char** buffer,
    size_t* buffer_size,
    py::object* fileobj) {
  out_sig_ = sf->signal;

  SoxEffect sink(sox_create_effect(get_fileobj_output_handler()));

  // Populate the private state read by the flow callback.
  auto* state = static_cast<FileObjOutputPriv*>(sink->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const auto status = sox_add_effect(sec_, sink, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output fileobj");
  }
}
} // namespace paddleaudio::sox_effects_chain

@ -1,76 +0,0 @@
#pragma once
#include <sox.h>
#include "paddlespeech/audio/src/pybind/sox/utils.h"
namespace paddleaudio::sox_effects_chain {
// Owning wrapper around the sox_effect_t* returned by sox_create_effect.
// The wrapped pointer is released in the destructor; the type is neither
// copyable nor movable, so there is always exactly one owner.
struct SoxEffect {
// Takes ownership of `se`.
explicit SoxEffect(sox_effect_t* se) noexcept;
SoxEffect(const SoxEffect& other) = delete;
SoxEffect(const SoxEffect&& other) = delete;
auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
~SoxEffect();
// Implicit conversion so the wrapper can be handed to libsox functions.
operator sox_effect_t*() const;
// Member access on the wrapped effect.
auto operator->() noexcept -> sox_effect_t*;
private:
// Wrapped effect.
sox_effect_t* se_;
};
// Owning wrapper around sox_effects_chain_t with convenience methods for
// assembling a chain: one input, any number of effects, one output.
// The type is neither copyable nor movable.
class SoxEffectsChain {
// Encodings the chain was created with. The chain is created from pointers
// to these members, so they are declared before `sec_`.
const sox_encodinginfo_t in_enc_;
const sox_encodinginfo_t out_enc_;
protected:
// Signal info of the chain input.
sox_signalinfo_t in_sig_;
// Signal info passed to sox_add_effect as the running (intermediate) signal.
sox_signalinfo_t interm_sig_;
// Signal info of the chain output (set when writing to a file / file object).
sox_signalinfo_t out_sig_;
// Underlying libsox chain; deleted in the destructor.
sox_effects_chain_t* sec_;
public:
// Creates an empty chain; throws std::runtime_error on failure.
explicit SoxEffectsChain(
sox_encodinginfo_t input_encoding,
sox_encodinginfo_t output_encoding);
SoxEffectsChain(const SoxEffectsChain& other) = delete;
SoxEffectsChain(const SoxEffectsChain&& other) = delete;
SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
~SoxEffectsChain();
// Runs the assembled chain.
void run();
// Input: samples held in a NumPy array.
void addInputTensor(
py::array* waveform,
int64_t sample_rate,
bool channels_first);
// Input: an audio file opened for reading.
void addInputFile(sox_format_t* sf);
// Output: samples appended to `output_buffer`.
void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
// Output: an audio file opened for writing.
void addOutputFile(sox_format_t* sf);
// Adds an effect given as {name, option...}.
void addEffect(const std::vector<std::string> effect);
// Channel count / sample rate of the intermediate signal info.
int64_t getOutputNumChannels();
int64_t getOutputSampleRate();
};
// SoxEffectsChain extended with input/output effects that exchange data with
// Python file-like objects. Constructors are inherited from the base class.
class SoxEffectsChainPyBind : public SoxEffectsChain {
using SoxEffectsChain::SoxEffectsChain;
public:
// Input: bytes pulled from `fileobj`, decoded through `sf`, which reads from
// `buffer` (of `buffer_size` bytes).
void addInputFileObj(
sox_format_t* sf,
char* buffer,
uint64_t buffer_size,
py::object* fileobj);
// Output: samples encoded through `sf` into the memory stream described by
// `buffer` / `buffer_size`, then written to `fileobj`.
void addOutputFileObj(
sox_format_t* sf,
char** buffer,
size_t* buffer_size,
py::object* fileobj);
};
} // namespace paddleaudio::sox_effects_chain

@ -1,280 +0,0 @@
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
#include "paddlespeech/audio/src/pybind/sox/io.h"
#include "paddlespeech/audio/src/pybind/sox/effects.h"
#include "paddlespeech/audio/src/pybind/sox/types.h"
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
#include "paddlespeech/audio/src/pybind/sox/utils.h"
#include "paddlespeech/audio/src/optional/optional.hpp"
using namespace paddleaudio::sox_utils;
namespace paddleaudio {
namespace sox_io {
/// Reads the metadata of the audio file at `path`.
///
/// @param path   file to inspect.
/// @param format optional file type forwarded to libsox; libsox decides on
///               its own when absent.
/// @return (sample rate, number of frames, number of channels,
///          bits per sample, encoding name).
/// @throws std::runtime_error if the file cannot be opened or its encoding
///         is unknown.
auto get_info_file(const std::string &path,
                   const tl::optional<std::string> &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
  const char *filetype = format.has_value() ? format.value().c_str() : nullptr;

  SoxFormat sf(sox_open_read(path.data(),
                             /*signal=*/nullptr,
                             /*encoding=*/nullptr,
                             filetype));
  validate_input_file(sf, path);

  const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
  // `length` counts samples over all channels; divide to get frames.
  const auto num_frames =
      static_cast<int64_t>(sf->signal.length / sf->signal.channels);
  const auto num_channels = static_cast<int64_t>(sf->signal.channels);
  const auto bits = static_cast<int64_t>(sf->encoding.bits_per_sample);

  return std::make_tuple(sample_rate,
                         num_frames,
                         num_channels,
                         bits,
                         get_encoding(sf->encoding.encoding));
}
/// Translates a frame window into the list of sox effects that selects it.
///
/// @param frame_offset first frame to keep (defaults to 0).
/// @param num_frames   number of frames to keep; -1 (the default) keeps
///                     everything after the offset.
/// @return an empty list when nothing needs trimming, otherwise a single
///         "trim" effect: {"trim", "<offset>s"} or
///         {"trim", "<offset>s", "+<frames>s"}.
/// @throws std::runtime_error if `frame_offset` is negative or `num_frames`
///         is 0 or less than -1.
std::vector<std::vector<std::string>> get_effects(
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames) {
  const auto offset = frame_offset.value_or(0);
  if (offset < 0) {
    throw std::runtime_error(
        "Invalid argument: frame_offset must be non-negative.");
  }
  const auto frames = num_frames.value_or(-1);
  if (frames == 0 || frames < -1) {
    throw std::runtime_error(
        "Invalid argument: num_frames must be -1 or greater than 0.");
  }

  std::vector<std::vector<std::string>> effects;

  const bool bounded = (frames != -1);
  if (!bounded && offset == 0) {
    // Whole file requested: no trimming needed.
    return effects;
  }

  // The "s" suffix tells sox the position is given in samples.
  std::ostringstream start;
  start << offset << "s";
  std::vector<std::string> trim{"trim", start.str()};

  if (bounded) {
    std::ostringstream length;
    length << "+" << frames << "s";
    trim.push_back(length.str());
  }

  effects.emplace_back(trim);
  return effects;
}
/// Reads the metadata of audio provided through a Python file-like object.
///
/// Reads up to max(sox buffer size, 4096) bytes from `fileobj` and lets
/// libsox parse the header from that in-memory copy.
///
/// @param fileobj object exposing `read(n)`.
/// @param format  optional file type forwarded to libsox.
/// @return (sample rate, number of frames, number of channels,
///          bits per sample, encoding name).
/// @throws std::runtime_error if the data cannot be opened or its encoding
///         is unknown.
auto get_info_fileobj(py::object fileobj,
                      const tl::optional<std::string> &format)
    -> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
  // Use the configured sox buffer size, but never less than 4 KiB.
  const int64_t kDefaultCapacityInBytes = 4096;
  const auto bufsiz = get_buffer_size();
  const auto capacity =
      (bufsiz > kDefaultCapacityInBytes) ? bufsiz : kDefaultCapacityInBytes;

  std::string buffer(capacity, '\0');
  auto *buf = const_cast<char *>(buffer.data());
  auto num_read = read_fileobj(&fileobj, capacity, buf);

  // If the file is shorter than 256, then libsox cannot read the header.
  auto buf_size = (num_read > 256) ? num_read : 256;

  const char *filetype = format.has_value() ? format.value().c_str() : nullptr;
  SoxFormat sf(sox_open_mem_read(buf,
                                 buf_size,
                                 /*signal=*/nullptr,
                                 /*encoding=*/nullptr,
                                 filetype));

  // In case of streamed data, length can be 0
  validate_input_memfile(sf);

  const auto sample_rate = static_cast<int64_t>(sf->signal.rate);
  const auto num_frames =
      static_cast<int64_t>(sf->signal.length / sf->signal.channels);
  const auto num_channels = static_cast<int64_t>(sf->signal.channels);
  const auto bits = static_cast<int64_t>(sf->encoding.bits_per_sample);

  return std::make_tuple(sample_rate,
                         num_frames,
                         num_channels,
                         bits,
                         get_encoding(sf->encoding.encoding));
}
/// Loads audio from a Python file-like object.
///
/// The frame window is turned into sox effects by `get_effects`; everything
/// else is forwarded unchanged to `sox_effects::apply_effects_fileobj`,
/// whose result is returned as is.
///
/// @throws std::runtime_error for an invalid frame window (see get_effects).
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
    py::object fileobj,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  const auto trim_effects = get_effects(frame_offset, num_frames);
  auto loaded = paddleaudio::sox_effects::apply_effects_fileobj(
      std::move(fileobj),
      trim_effects,
      normalize,
      channels_first,
      std::move(format));
  return loaded;
}
/// Loads audio from the file at `path`.
///
/// The frame window is turned into sox effects by `get_effects`; everything
/// else is forwarded unchanged to `sox_effects::apply_effects_file`, whose
/// result is returned as is.
///
/// @throws std::runtime_error for an invalid frame window (see get_effects).
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  const auto trim_effects = get_effects(frame_offset, num_frames);
  auto loaded = paddleaudio::sox_effects::apply_effects_file(
      path, trim_effects, normalize, channels_first, format);
  return loaded;
}
/// Encodes `tensor` and writes it to the audio file at `path`.
///
/// @param path            destination file.
/// @param tensor          2-D array of samples.
/// @param sample_rate     sample rate of `tensor`.
/// @param channels_first  true if `tensor` is laid out (channels, frames),
///                        false for (frames, channels).
/// @param compression     optional format-specific compression setting.
/// @param format          optional file type; derived from `path` when absent.
/// @param encoding        optional encoding name.
/// @param bits_per_sample optional bit depth.
/// @throws std::runtime_error if the tensor is invalid, the format's channel
///         or sample-rate restrictions are violated, or the file cannot be
///         opened for writing.
void save_audio_file(const std::string& path,
                     py::array tensor,
                     int64_t sample_rate,
                     bool channels_first,
                     tl::optional<double> compression,
                     tl::optional<std::string> format,
                     tl::optional<std::string> encoding,
                     tl::optional<int64_t> bits_per_sample) {
  validate_input_tensor(tensor);

  const auto filetype = [&]() {
    if (format.has_value()) return format.value();
    return get_filetype(path);
  }();

  // Format restrictions are reported as exceptions (as save_audio_fileobj
  // does) instead of assert(), which is compiled out under NDEBUG and would
  // let invalid input through in release builds.
  if (filetype == "amr-nb") {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(
          "amr-nb format only supports single channel audio.");
    }
  } else if (filetype == "htk") {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(
          "htk format only supports single channel audio.");
    }
  } else if (filetype == "gsm") {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(
          "gsm format only supports single channel audio.");
    }
    if (sample_rate != 8000) {
      throw std::runtime_error(
          "gsm format only supports a sampling rate of 8kHz.");
    }
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(
      filetype, tensor.dtype(), compression, encoding, bits_per_sample);

  SoxFormat sf(sox_open_write(path.c_str(),
                              &signal_info,
                              &encoding_info,
                              /*filetype=*/filetype.c_str(),
                              /*oob=*/nullptr,
                              /*overwrite_permitted=*/nullptr));
  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open file " + path);
  }

  // tensor -> (encode) -> file
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFile(sf);
  chain.run();
}
namespace {
// Owns the buffer produced by sox_open_memstream_write and frees it when it
// goes out of scope. Used by save_audio_fileobj.
// Neither copyable nor movable, so the buffer has exactly one owner.
struct AutoReleaseBuffer {
  char* ptr;    // start of the buffer (null until libsox allocates it)
  size_t size;  // size of the buffer in bytes

  AutoReleaseBuffer() : ptr(nullptr), size(0) {}
  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
  auto operator=(const AutoReleaseBuffer& other) -> AutoReleaseBuffer& = delete;
  auto operator=(AutoReleaseBuffer&& other) -> AutoReleaseBuffer& = delete;

  ~AutoReleaseBuffer() {
    // free() is a no-op for a null pointer.
    free(ptr);
  }
};
} // namespace
/// Encodes `tensor` and writes the result to a Python file-like object.
///
/// The audio is encoded into an in-memory stream; encoded chunks are passed
/// to `fileobj.write(...)` while the chain runs, and the final chunk is
/// written after the format handle has been closed.
///
/// @param fileobj         object exposing `write(bytes)`.
/// @param tensor          array of samples.
/// @param sample_rate     sample rate of `tensor`.
/// @param channels_first  true if `tensor` is laid out (channels, frames),
///                        false for (frames, channels).
/// @param compression     optional format-specific compression setting.
/// @param format          file type; required.
/// @param encoding        optional encoding name.
/// @param bits_per_sample optional bit depth.
/// @throws std::runtime_error if `format` is missing, the format's channel or
///         sample-rate restrictions are violated, or the memory stream cannot
///         be opened.
void save_audio_fileobj(
    py::object fileobj,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample) {
  if (!format.has_value()) {
    throw std::runtime_error(
        "`format` is required when saving to file object.");
  }
  const auto filetype = format.value();

  // Some formats accept mono audio only; pick the matching error message, or
  // leave it null when the format has no such restriction.
  const char* mono_only_error = nullptr;
  const bool is_gsm = (filetype == "gsm");
  if (filetype == "amr-nb") {
    mono_only_error = "amr-nb format only supports single channel audio.";
  } else if (filetype == "htk") {
    mono_only_error = "htk format only supports single channel audio.";
  } else if (is_gsm) {
    mono_only_error = "gsm format only supports single channel audio.";
  }

  if (mono_only_error != nullptr) {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(mono_only_error);
    }
    // gsm is additionally tied to a fixed sample rate.
    if (is_gsm && sample_rate != 8000) {
      throw std::runtime_error(
          "gsm format only supports a sampling rate of 8kHz.");
    }
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(
      filetype,
      tensor.dtype(),
      compression,
      std::move(encoding),
      bits_per_sample);

  // Declared before `sf` so that the buffer outlives the format handle.
  AutoReleaseBuffer buffer;
  SoxFormat sf(sox_open_memstream_write(
      &buffer.ptr,
      &buffer.size,
      &signal_info,
      &encoding_info,
      filetype.c_str(),
      /*oob=*/nullptr));
  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open memory stream.");
  }

  // tensor -> (encode) -> memory stream -> fileobj
  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
  chain.run();

  // Closing the sox_format_t is necessary for flushing the last chunk to the
  // buffer
  sf.close();
  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
}
} // namespace sox_io
} // namespace paddleaudio

@ -1,63 +0,0 @@
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
#pragma once
#include "paddlespeech/audio/src/pybind/sox/utils.h"
namespace py = pybind11;
namespace paddleaudio {
namespace sox_io {
// Returns (sample rate, number of frames, number of channels,
// bits per sample, encoding name) of the audio file at `path`.
// `format` optionally overrides the file type passed to libsox.
auto get_info_file(const std::string &path,
const tl::optional<std::string> &format)
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
// Same as get_info_file, but reads the data from a Python file-like object.
auto get_info_fileobj(py::object fileobj,
const tl::optional<std::string> &format)
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
// Loads audio from a Python file-like object, optionally restricted to
// `num_frames` frames starting at `frame_offset`.
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
py::object fileobj,
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format);
// Encodes `tensor` and writes it to a Python file-like object.
// `format` is required here because there is no file name to derive it from.
void save_audio_fileobj(
py::object fileobj,
py::array tensor,
int64_t sample_rate,
bool channels_first,
tl::optional<double> compression,
tl::optional<std::string> format,
tl::optional<std::string> encoding,
tl::optional<int64_t> bits_per_sample);
// Builds the sox "trim" effect for the given frame window; the result is
// empty when the whole file is requested.
auto get_effects(const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames)
-> std::vector<std::vector<std::string>>;
// Loads audio from the file at `path`, optionally restricted to `num_frames`
// frames starting at `frame_offset`.
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
const std::string& path,
const tl::optional<int64_t>& frame_offset,
const tl::optional<int64_t>& num_frames,
tl::optional<bool> normalize,
tl::optional<bool> channels_first,
const tl::optional<std::string>& format);
// Encodes `tensor` and writes it to the file at `path`. When `format` is
// absent the file type is derived from `path`.
void save_audio_file(const std::string& path,
py::array tensor,
int64_t sample_rate,
bool channels_first,
tl::optional<double> compression,
tl::optional<std::string> format,
tl::optional<std::string> encoding,
tl::optional<int64_t> bits_per_sample);
} // namespace sox_io
} // namespace paddleaudio

@ -1,143 +0,0 @@
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
#include "paddlespeech/audio/src/pybind/sox/types.h"
#include <ostream>
#include <sstream>
namespace paddleaudio {
namespace sox_utils {
/// Maps a file-type name to the corresponding `Format` value.
///
/// Both "ogg" and "vorbis" map to `Format::VORBIS`.
///
/// @throws std::runtime_error for any name that is not listed.
Format get_format_from_string(const std::string& format) {
  struct Entry {
    const char* name;
    Format value;
  };
  static const Entry kKnownFormats[] = {
      {"wav", Format::WAV},
      {"mp3", Format::MP3},
      {"flac", Format::FLAC},
      {"ogg", Format::VORBIS},
      {"vorbis", Format::VORBIS},
      {"amr-nb", Format::AMR_NB},
      {"amr-wb", Format::AMR_WB},
      {"amb", Format::AMB},
      {"sph", Format::SPHERE},
      {"htk", Format::HTK},
      {"gsm", Format::GSM},
  };

  for (const auto& entry : kKnownFormats) {
    if (format == entry.name) {
      return entry.value;
    }
  }

  std::ostringstream message;
  message << "Internal Error: unexpected format value: " << format;
  throw std::runtime_error(message.str());
}
/// Returns the textual name of an `Encoding` value.
///
/// @throws std::runtime_error for values without a name, which includes
///         `Encoding::NOT_PROVIDED`.
std::string to_string(Encoding v) {
  const char* label = nullptr;
  switch (v) {
    case Encoding::UNKNOWN:
      label = "UNKNOWN";
      break;
    case Encoding::PCM_SIGNED:
      label = "PCM_S";
      break;
    case Encoding::PCM_UNSIGNED:
      label = "PCM_U";
      break;
    case Encoding::PCM_FLOAT:
      label = "PCM_F";
      break;
    case Encoding::FLAC:
      label = "FLAC";
      break;
    case Encoding::ULAW:
      label = "ULAW";
      break;
    case Encoding::ALAW:
      label = "ALAW";
      break;
    case Encoding::MP3:
      label = "MP3";
      break;
    case Encoding::VORBIS:
      label = "VORBIS";
      break;
    case Encoding::AMR_WB:
      label = "AMR_WB";
      break;
    case Encoding::AMR_NB:
      label = "AMR_NB";
      break;
    case Encoding::OPUS:
      label = "OPUS";
      break;
    default:
      break;
  }
  if (label == nullptr) {
    throw std::runtime_error("Internal Error: unexpected encoding.");
  }
  return label;
}
/// Converts the optional user-supplied encoding name to an `Encoding` value.
///
/// @return `Encoding::NOT_PROVIDED` when no name was given; otherwise the
///         value for "PCM_S", "PCM_U", "PCM_F", "ULAW" or "ALAW".
/// @throws std::runtime_error for any other name.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
  if (!encoding.has_value()) {
    return Encoding::NOT_PROVIDED;
  }

  struct Entry {
    const char* name;
    Encoding value;
  };
  static const Entry kAcceptedNames[] = {
      {"PCM_S", Encoding::PCM_SIGNED},
      {"PCM_U", Encoding::PCM_UNSIGNED},
      {"PCM_F", Encoding::PCM_FLOAT},
      {"ULAW", Encoding::ULAW},
      {"ALAW", Encoding::ALAW},
  };

  const std::string requested = encoding.value();
  for (const auto& entry : kAcceptedNames) {
    if (requested == entry.name) {
      return entry.value;
    }
  }

  std::ostringstream message;
  message << "Internal Error: unexpected encoding value: " << requested;
  throw std::runtime_error(message.str());
}
/// Converts the optional user-supplied bit depth to a `BitDepth` value.
///
/// @return `BitDepth::NOT_PROVIDED` when no value was given; otherwise the
///         value for 8, 16, 24, 32 or 64.
/// @throws std::runtime_error for any other number.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
  if (!bit_depth.has_value()) {
    return BitDepth::NOT_PROVIDED;
  }

  const int64_t bits = bit_depth.value();
  if (bits == 8) {
    return BitDepth::B8;
  }
  if (bits == 16) {
    return BitDepth::B16;
  }
  if (bits == 24) {
    return BitDepth::B24;
  }
  if (bits == 32) {
    return BitDepth::B32;
  }
  if (bits == 64) {
    return BitDepth::B64;
  }

  std::ostringstream message;
  message << "Internal Error: unexpected bit depth value: " << bits;
  throw std::runtime_error(message.str());
}
/// Returns the textual name of a libsox encoding.
///
/// Encodings without a dedicated name are reported as "UNKNOWN".
std::string get_encoding(sox_encoding_t encoding) {
  // Fallback for SOX_ENCODING_UNKNOWN and for every encoding not listed below.
  const char* label = "UNKNOWN";
  switch (encoding) {
    case SOX_ENCODING_SIGN2:
      label = "PCM_S";
      break;
    case SOX_ENCODING_UNSIGNED:
      label = "PCM_U";
      break;
    case SOX_ENCODING_FLOAT:
      label = "PCM_F";
      break;
    case SOX_ENCODING_FLAC:
      label = "FLAC";
      break;
    case SOX_ENCODING_ULAW:
      label = "ULAW";
      break;
    case SOX_ENCODING_ALAW:
      label = "ALAW";
      break;
    case SOX_ENCODING_MP3:
      label = "MP3";
      break;
    case SOX_ENCODING_VORBIS:
      label = "VORBIS";
      break;
    case SOX_ENCODING_AMR_WB:
      label = "AMR_WB";
      break;
    case SOX_ENCODING_AMR_NB:
      label = "AMR_NB";
      break;
    case SOX_ENCODING_OPUS:
      label = "OPUS";
      break;
    case SOX_ENCODING_GSM:
      label = "GSM";
      break;
    default:
      break;
  }
  return label;
}
} // namespace sox_utils
} // namespace paddleaudio

@ -1,58 +0,0 @@
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
#pragma once
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace paddleaudio {
namespace sox_utils {
// File formats recognised by get_format_from_string.
enum class Format {
WAV,
MP3,
FLAC,
VORBIS,
AMR_NB,
AMR_WB,
AMB,
SPHERE,
GSM,
HTK,
};
// Maps a file-type name (e.g. "wav", "ogg") to a Format; throws
// std::runtime_error for unknown names.
Format get_format_from_string(const std::string& format);
// Sample encodings. NOT_PROVIDED marks an absent user option and has no
// textual name (to_string throws for it).
enum class Encoding {
NOT_PROVIDED,
UNKNOWN,
PCM_SIGNED,
PCM_UNSIGNED,
PCM_FLOAT,
FLAC,
ULAW,
ALAW,
MP3,
VORBIS,
AMR_WB,
AMR_NB,
OPUS,
};
// Returns the textual name of `v`; throws std::runtime_error for values
// without one.
std::string to_string(Encoding v);
// Parses an optional encoding name ("PCM_S", "PCM_U", "PCM_F", "ULAW",
// "ALAW"); returns NOT_PROVIDED when absent, throws for other names.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
// Bit depths; each enumerator's value is its number of bits, and
// NOT_PROVIDED (0) marks an absent user option.
enum class BitDepth : unsigned {
NOT_PROVIDED = 0,
B8 = 8,
B16 = 16,
B24 = 24,
B32 = 32,
B64 = 64,
};
// Parses an optional bit depth (8, 16, 24, 32 or 64); returns NOT_PROVIDED
// when absent, throws std::runtime_error for other values.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
// Returns the textual name of a libsox encoding ("UNKNOWN" if it has none).
std::string get_encoding(sox_encoding_t encoding);
} // namespace sox_utils
} // namespace paddleaudio

@ -1,642 +0,0 @@
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
#include <sox.h>
#include "paddlespeech/audio/src/pybind/sox/utils.h"
#include "paddlespeech/audio/src/pybind/sox/types.h"
#include <sstream>
namespace paddleaudio {
namespace sox_utils {
/// Reads up to `size` bytes from a Python file-like object into `buffer`.
///
/// `fileobj.read(n)` is called repeatedly until `size` bytes have been
/// collected or a call returns an empty result.
///
/// @param fileobj object exposing `read(n)` that returns bytes.
/// @param size    maximum number of bytes to read.
/// @param buffer  destination with room for at least `size` bytes.
/// @return the number of bytes actually stored in `buffer`.
/// @throws std::runtime_error if `read` returns more bytes than requested.
auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
    -> uint64_t {
  uint64_t total = 0;
  char *cursor = buffer;

  while (total < size) {
    const auto wanted = size - total;
    const auto piece = static_cast<std::string>(
        static_cast<py::bytes>(fileobj->attr("read")(wanted)));
    const auto received = piece.length();

    // An empty read means the stream is exhausted.
    if (received == 0) {
      break;
    }

    // Returning more than requested would overflow the destination.
    if (received > wanted) {
      std::ostringstream message;
      message
          << "Requested up to " << wanted << " bytes but, "
          << "received " << received << " bytes. "
          << "The given object does not confirm to read protocol of file "
             "object.";
      throw std::runtime_error(message.str());
    }

    memcpy(cursor, piece.data(), received);
    cursor += received;
    total += received;
  }
  return total;
}
/// Stores `seed` (narrowed to `sox_int32_t`) in the `ranqd1` field of the
/// libsox globals.
void set_seed(const int64_t seed) {
  auto* globals = sox_get_globals();
  globals->ranqd1 = static_cast<sox_int32_t>(seed);
}
/// Stores `verbosity` (converted to `unsigned`) in the libsox globals.
void set_verbosity(const int64_t verbosity) {
  auto* globals = sox_get_globals();
  globals->verbosity = static_cast<unsigned>(verbosity);
}
/// Stores the `use_threads` flag in the libsox globals.
void set_use_threads(const bool use_threads) {
  auto* globals = sox_get_globals();
  globals->use_threads = static_cast<sox_bool>(use_threads);
}
/// Stores `buffer_size` (converted to `size_t`) in the `bufsiz` field of the
/// libsox globals.
void set_buffer_size(const int64_t buffer_size) {
  auto* globals = sox_get_globals();
  globals->bufsiz = static_cast<size_t>(buffer_size);
}
/// Returns the `bufsiz` field of the libsox globals.
int64_t get_buffer_size() {
  const auto* globals = sox_get_globals();
  return globals->bufsiz;
}
std::vector<std::vector<std::string>> list_effects() {
std::vector<std::vector<std::string>> effects;
for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
const sox_effect_handler_t* handler = (*fns)();
if (handler && handler->name) {
if (UNSUPPORTED_EFFECTS.find(handler->name) ==
UNSUPPORTED_EFFECTS.end()) {
effects.emplace_back(std::vector<std::string>{
handler->name,
handler->usage ? std::string(handler->usage) : std::string("")});
}
}
}
return effects;
}
std::vector<std::string> list_write_formats() {
std::vector<std::string> formats;
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
const sox_format_handler_t* handler = fns->fn();
for (const char* const* names = handler->names; *names; ++names) {
if (!strchr(*names, '/') && handler->write)
formats.emplace_back(*names);
}
}
return formats;
}
std::vector<std::string> list_read_formats() {
std::vector<std::string> formats;
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
const sox_format_handler_t* handler = fns->fn();
for (const char* const* names = handler->names; *names; ++names) {
if (!strchr(*names, '/') && handler->read)
formats.emplace_back(*names);
}
}
return formats;
}
/// Wraps `fd`; the handle is closed with `sox_close` by `close()` or the
/// destructor. `fd` may be null.
SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
/// Closes the wrapped handle if it is still open.
SoxFormat::~SoxFormat() {
close();
}
/// Member access on the wrapped `sox_format_t` (e.g. `sf->signal`).
sox_format_t* SoxFormat::operator->() const noexcept {
return fd_;
}
/// Implicit conversion to the wrapped raw pointer, so a `SoxFormat` can be
/// passed directly to libsox functions. Ownership is not transferred.
SoxFormat::operator sox_format_t*() const noexcept {
return fd_;
}
/// Closes the wrapped handle and resets it to null.
///
/// Safe to call more than once: a second call finds the handle already null
/// and does nothing.
void SoxFormat::close() {
  if (fd_ == nullptr) {
    return;
  }
  sox_close(fd_);
  fd_ = nullptr;
}
void validate_input_file(const SoxFormat& sf, const std::string& path) {
if (static_cast<sox_format_t*>(sf) == nullptr) {
throw std::runtime_error(
"Error loading audio file: failed to open file " + path);
}
if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
throw std::runtime_error("Error loading audio file: unknown encoding.");
}
}
/// Same checks as validate_input_file(), using "<in memory buffer>" as the
/// path shown in error messages.
void validate_input_memfile(const SoxFormat &sf) {
  const std::string label = "<in memory buffer>";
  validate_input_file(sf, label);
}
/// Verify that `tensor` is 2D and holds float32, int32, int16 or uint8 data.
///
/// The element type is checked through the dtype's kind and item size rather
/// than its type character, so the accepted set matches the one named in the
/// error message. The previous character test accepted float64 ('d') and
/// long ('l') while rejecting int16 and uint8.
///
/// @param tensor Array to validate.
/// @throws std::runtime_error if the array is not 2D or has another dtype.
void validate_input_tensor(const py::array tensor) {
  if (tensor.ndim() != 2) {
    throw std::runtime_error("Input tensor has to be 2D.");
  }
  const py::dtype dtype = tensor.dtype();
  const char kind = dtype.kind();
  const auto itemsize = dtype.itemsize();
  const bool is_float32 = (kind == 'f' && itemsize == 4);
  const bool is_int32 = (kind == 'i' && itemsize == 4);
  const bool is_int16 = (kind == 'i' && itemsize == 2);
  const bool is_uint8 = (kind == 'u' && itemsize == 1);
  if (!(is_float32 || is_int32 || is_int16 || is_uint8)) {
    throw std::runtime_error(
        "Input tensor has to be one of float32, int32, int16 or uint8 type.");
  }
}
/// Get the numpy dtype used to hold samples of the given sox encoding.
///
/// @param encoding Sox encoding of the audio data.
/// @param precision Bits per sample; only consulted for signed PCM.
/// @return uint8 for unsigned PCM, int16 or int32 for signed PCM, and
/// float32 for every other encoding.
/// @throws std::runtime_error for signed PCM with a precision other than
/// 16, 24 or 32.
py::dtype get_dtype(
    const sox_encoding_t encoding,
    const unsigned precision) {
  switch (encoding) {
    case SOX_ENCODING_UNSIGNED: // 8-bit PCM WAV
      // Must be a string literal: 'u1' in single quotes is a multi-character
      // constant of type int, which is not a dtype format string.
      return py::dtype("u1");
    case SOX_ENCODING_SIGN2: // 16-bit, 24-bit, or 32-bit PCM WAV
      switch (precision) {
        case 16:
          return py::dtype("i2");
        case 24: // Cast 24-bit to 32-bit.
        case 32:
          return py::dtype("i");
        default:
          throw std::runtime_error(
              "Only 16, 24, and 32 bits are supported for signed PCM.");
      }
    default:
      // default to float32 for the other formats, including
      // 32-bit floating-point WAV,
      // MP3,
      // FLAC,
      // VORBIS etc...
      return py::dtype("f");
  }
}
namespace {

/// Copy a [num_rows, num_channels] array of T into a newly allocated
/// [num_channels, num_rows] array of the given dtype.
template <typename T>
py::array to_channels_first(
    const py::array& frames,
    const py::dtype& dtype,
    const int32_t num_rows,
    const int32_t num_channels) {
  py::array out = py::array(dtype, {num_channels, num_rows});
  for (int32_t ch = 0; ch < num_channels; ++ch) {
    for (int32_t row = 0; row < num_rows; ++row) {
      *(T*)out.mutable_data(ch, row) = *(const T*)frames.data(row, ch);
    }
  }
  return out;
}

} // namespace

/// Convert a sox_sample_t buffer to a uint8/int16/int32/float32 array.
///
/// @param buffer Interleaved samples to convert.
/// @param num_samples Number of samples in `buffer` (all channels together).
/// @param num_channels Number of channels; must be positive.
/// @param dtype Requested output dtype.
/// @param normalize When true the output is float32 regardless of `dtype`.
/// @param channels_first When true the result has shape
/// [num_channels, num_frames], otherwise [num_frames, num_channels].
/// @throws std::runtime_error if `num_channels` is not positive or `dtype`
/// is not supported.
py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first) {
  if (num_channels <= 0) {
    // Guard the division below.
    throw std::runtime_error("Number of channels must be positive.");
  }
  // Clip counter required by the SOX_SAMPLE_TO_* macros; the count is unused.
  uint64_t dummy = 0;
  SOX_SAMPLE_LOCALS;
  const int32_t num_rows = num_samples / num_channels;
  // Never write more elements than the [num_rows, num_channels] array holds,
  // even if num_samples is not a multiple of num_channels.
  const int32_t total = num_rows * num_channels;
  const char code = dtype.char_();

  if (normalize || code == 'f') {
    // The values written are floats, so the storage must be float32 too.
    // Allocating with the caller's dtype would overrun narrower arrays.
    const py::dtype float_dtype("f");
    py::array t = py::array(float_dtype, {num_rows, num_channels});
    auto ptr = (float*)t.mutable_data();
    for (int32_t i = 0; i < total; ++i) {
      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<float>(t, float_dtype, num_rows, num_channels);
    }
    return t;
  }
  if (code == 'i') { // int32
    py::array t = py::array(dtype, {num_rows, num_channels});
    auto ptr = (int*)t.mutable_data();
    for (int32_t i = 0; i < total; ++i) {
      ptr[i] = buffer[i];
    }
    if (channels_first) {
      return to_channels_first<int>(t, dtype, num_rows, num_channels);
    }
    return t;
  }
  if (code == 'h') { // int16
    py::array t = py::array(dtype, {num_rows, num_channels});
    auto ptr = (int16_t*)t.mutable_data();
    for (int32_t i = 0; i < total; ++i) {
      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<int16_t>(t, dtype, num_rows, num_channels);
    }
    return t;
  }
  // 'B' is uint8. 'b' (int8) is still accepted because the earlier version of
  // this function only recognised that character.
  if (code == 'B' || code == 'b') {
    py::array t = py::array(dtype, {num_rows, num_channels});
    auto ptr = (uint8_t*)t.mutable_data();
    for (int32_t i = 0; i < total; ++i) {
      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return to_channels_first<uint8_t>(t, dtype, num_rows, num_channels);
    }
    return t;
  }
  throw std::runtime_error("Unsupported dtype.");
}
/// Extract the lower-cased extension from a file path.
///
/// Returns everything after the last '.' in `path`. When `path` contains no
/// '.', the whole path is returned lower-cased.
const std::string get_filetype(const std::string path) {
  std::string ext = path.substr(path.find_last_of(".") + 1);
  // tolower() is only defined for values representable as unsigned char, so
  // convert first; passing a negative plain char is undefined behaviour.
  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  return ext;
}
namespace {
/// Choose the sox encoding and bit depth for saving with the WAV rules.
///
/// @param format Format name; used only in error messages.
/// @param dtype dtype of the array being saved; consulted only when neither
/// `encoding` nor `bits_per_sample` is provided.
/// @param encoding Requested encoding, or Encoding::NOT_PROVIDED.
/// @param bits_per_sample Requested bit depth, or BitDepth::NOT_PROVIDED.
/// @return (sox encoding, bits per sample).
/// @throws std::runtime_error for an unsupported dtype or an unsupported
/// encoding / bit-depth combination.
std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
    const std::string format,
    py::dtype dtype,
    const Encoding& encoding,
    const BitDepth& bits_per_sample) {
  switch (encoding) {
    case Encoding::NOT_PROVIDED:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
          // Fall back to the array's own element type (numpy type numbers).
          switch (dtype.num()) {
            case 11: // float32 numpy dtype num
              return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
            case 5: // int numpy dtype num
              return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
            case 3: // int16 numpy
              return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
            case 1: // int8 numpy (NPY_BYTE), kept for compatibility
            case 2: // uint8 numpy (NPY_UBYTE)
              return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
            default:
              throw std::runtime_error("Internal Error: Unexpected dtype.");
          }
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
        default:
          return std::make_tuple<>(
              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
      }
    case Encoding::PCM_SIGNED:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
          return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
        case BitDepth::B8:
          throw std::runtime_error(
              format + " does not support 8-bit signed PCM encoding.");
        default:
          return std::make_tuple<>(
              SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
      }
    case Encoding::PCM_UNSIGNED:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
        default:
          throw std::runtime_error(
              format + " only supports 8-bit for unsigned PCM encoding.");
      }
    case Encoding::PCM_FLOAT:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B32:
          return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
        case BitDepth::B64:
          return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
        default:
          throw std::runtime_error(
              format +
              " only supports 32-bit or 64-bit for floating-point PCM encoding.");
      }
    case Encoding::ULAW:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
        default:
          throw std::runtime_error(
              format + " only supports 8-bit for mu-law encoding.");
      }
    case Encoding::ALAW:
      switch (bits_per_sample) {
        case BitDepth::NOT_PROVIDED:
        case BitDepth::B8:
          return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
        default:
          throw std::runtime_error(
              format + " only supports 8-bit for a-law encoding.");
      }
    default:
      throw std::runtime_error(
          format + " does not support encoding: " + to_string(encoding));
  }
}
/// Choose the sox encoding and bit depth for saving in the given format.
///
/// @param format Format name, parsed with get_format_from_string().
/// @param dtype dtype of the array being saved; forwarded to the WAV rules.
/// @param encoding Optional encoding name.
/// @param bits_per_sample Optional bit depth.
/// @return (sox encoding, bits per sample).
/// @throws std::runtime_error when the format is unsupported or does not
/// accept the supplied options.
std::tuple<sox_encoding_t, unsigned> get_save_encoding(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const Format fmt = get_format_from_string(format);
  const Encoding enc = get_encoding_from_option(encoding);
  const BitDepth bps = get_bit_depth_from_option(bits_per_sample);

  const bool enc_given = enc != Encoding::NOT_PROVIDED;
  const bool bps_given = bps != BitDepth::NOT_PROVIDED;

  // For formats whose encoding is fixed: reject either option, checking
  // `encoding` before `bits_per_sample`.
  const auto reject_options = [&](const std::string& name) {
    if (enc_given) {
      throw std::runtime_error(name + " does not support `encoding` option.");
    }
    if (bps_given) {
      throw std::runtime_error(
          name + " does not support `bits_per_sample` option.");
    }
  };

  if (fmt == Format::WAV || fmt == Format::AMB) {
    return get_save_encoding_for_wav(format, dtype, enc, bps);
  }
  if (fmt == Format::MP3) {
    reject_options("mp3");
    return std::make_tuple<>(SOX_ENCODING_MP3, 16);
  }
  if (fmt == Format::HTK) {
    reject_options("htk");
    return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
  }
  if (fmt == Format::VORBIS) {
    reject_options("vorbis");
    return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
  }
  if (fmt == Format::AMR_NB) {
    reject_options("amr-nb");
    return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
  }
  if (fmt == Format::FLAC) {
    if (enc_given) {
      throw std::runtime_error("flac does not support `encoding` option.");
    }
    if (bps == BitDepth::B32 || bps == BitDepth::B64) {
      throw std::runtime_error(
          "flac does not support `bits_per_sample` larger than 24.");
    }
    return std::make_tuple<>(SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
  }
  if (fmt == Format::SPHERE) {
    const bool eight_bit_or_unset = !bps_given || bps == BitDepth::B8;
    if (enc == Encoding::NOT_PROVIDED || enc == Encoding::PCM_SIGNED) {
      if (!bps_given) {
        return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
      }
      return std::make_tuple<>(SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
    }
    if (enc == Encoding::PCM_UNSIGNED) {
      throw std::runtime_error("sph does not support unsigned integer PCM.");
    }
    if (enc == Encoding::PCM_FLOAT) {
      throw std::runtime_error("sph does not support floating point PCM.");
    }
    if (enc == Encoding::ULAW) {
      if (eight_bit_or_unset) {
        return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
      }
      throw std::runtime_error("sph only supports 8-bit for mu-law encoding.");
    }
    if (enc == Encoding::ALAW) {
      if (eight_bit_or_unset) {
        return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
      }
      // Unlike mu-law, other bit depths are passed through here.
      return std::make_tuple<>(SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
    }
    throw std::runtime_error(
        "sph does not support encoding: " + encoding.value());
  }
  if (fmt == Format::GSM) {
    reject_options("gsm");
    return std::make_tuple<>(SOX_ENCODING_GSM, 16);
  }
  throw std::runtime_error("Unsupported format: " + format);
}
/// Get the sample precision (in bits) reported to sox for a file type.
///
/// @param filetype Lower-case file type such as "wav" or "flac".
/// @param dtype dtype of the array; only consulted for "wav" and "amb".
/// @return Bits of precision, or SOX_UNSPEC for mp3, ogg and vorbis.
/// @throws std::runtime_error for an unsupported file type or dtype.
unsigned get_precision(const std::string filetype, py::dtype dtype) {
  if (filetype == "mp3")
    return SOX_UNSPEC;
  if (filetype == "flac")
    return 24;
  if (filetype == "ogg" || filetype == "vorbis")
    return SOX_UNSPEC;
  if (filetype == "wav" || filetype == "amb") {
    // Cases are numpy type numbers.
    switch (dtype.num()) {
      case 1: // int8 (NPY_BYTE), kept for compatibility
      case 2: // uint8 (NPY_UBYTE)
        return 8;
      case 3: // short, in numpy dtype num
        return 16;
      case 5: // int, numpy dtype
        return 32;
      case 11: // float, numpy dtype
        return 32;
      default:
        throw std::runtime_error("Unsupported dtype.");
    }
  }
  if (filetype == "sph")
    return 32;
  if (filetype == "amr-nb") {
    return 16;
  }
  if (filetype == "gsm") {
    return 16;
  }
  if (filetype == "htk") {
    return 16;
  }
  throw std::runtime_error("Unsupported file type: " + filetype);
}
} // namespace
/// Build the sox_signalinfo_t describing `waveform`.
///
/// @param waveform Array whose shape, dtype and size are read.
/// @param sample_rate Sample rate stored in the result.
/// @param filetype File type passed to get_precision().
/// @param channels_first Selects which axis holds the channel count:
/// axis 0 when true, axis 1 otherwise.
sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first) {
  const int channel_axis = channels_first ? 0 : 1;
  sox_signalinfo_t info{
      /*rate=*/static_cast<sox_rate_t>(sample_rate),
      /*channels=*/static_cast<unsigned>(waveform->shape(channel_axis)),
      /*precision=*/get_precision(filetype, waveform->dtype()),
      /*length=*/static_cast<uint64_t>(waveform->size())};
  return info;
}
/// Build the sox_encodinginfo_t that matches an array's element type.
///
/// @param dtype dtype of the array; matched by numpy type number.
/// @return Encoding info with the matching encoding and bits per sample;
/// compression is HUGE_VAL and the byte/nibble/bit reversal options are
/// left at sox_option_default.
/// @throws std::runtime_error for any other dtype.
sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
  sox_encoding_t encoding;
  unsigned bits_per_sample;
  // One switch decides both fields so they cannot drift apart.
  switch (dtype.num()) {
    case 1: // int8 (NPY_BYTE), kept for compatibility
    case 2: // uint8 (NPY_UBYTE)
      encoding = SOX_ENCODING_UNSIGNED;
      bits_per_sample = 8;
      break;
    case 3: // short
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 16;
      break;
    case 5: // int32
      encoding = SOX_ENCODING_SIGN2;
      bits_per_sample = 32;
      break;
    case 11: // float
      encoding = SOX_ENCODING_FLOAT;
      bits_per_sample = 32;
      break;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }
  return sox_encodinginfo_t{
      /*encoding=*/encoding,
      /*bits_per_sample=*/bits_per_sample,
      /*compression=*/HUGE_VAL,
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
}
/// Build the sox_encodinginfo_t used when saving to a file or file object.
///
/// The encoding and bit depth come from get_save_encoding(); compression
/// falls back to HUGE_VAL when not supplied.
sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const std::tuple<sox_encoding_t, unsigned> selected =
      get_save_encoding(format, dtype, encoding, bits_per_sample);
  sox_encodinginfo_t info{
      /*encoding=*/std::get<0>(selected),
      /*bits_per_sample=*/std::get<1>(selected),
      /*compression=*/compression.value_or(HUGE_VAL),
      /*reverse_bytes=*/sox_option_default,
      /*reverse_nibbles=*/sox_option_default,
      /*reverse_bits=*/sox_option_default,
      /*opposite_endian=*/sox_false};
  return info;
}
/*
SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {}
SoxFormat::~SoxFormat() { close(); }
sox_format_t *SoxFormat::operator->() const noexcept { return fd_; }
SoxFormat::operator sox_format_t *() const noexcept { return fd_; }
void SoxFormat::close() {
if (fd_ != nullptr) {
sox_close(fd_);
fd_ = nullptr;
}
}
auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
-> uint64_t {
uint64_t num_read = 0;
while (num_read < size) {
auto request = size - num_read;
auto chunk = static_cast<std::string>(
static_cast<py::bytes>(fileobj->attr("read")(request)));
auto chunk_len = chunk.length();
if (chunk_len == 0) {
break;
}
if (chunk_len > request) {
std::ostringstream message;
message
<< "Requested up to " << request << " bytes but, "
<< "received " << chunk_len << " bytes. "
<< "The given object does not confirm to read protocol of file "
"object.";
throw std::runtime_error(message.str());
}
memcpy(buffer, chunk.data(), chunk_len);
buffer += chunk_len;
num_read += chunk_len;
}
return num_read;
}
int64_t get_buffer_size() { return sox_get_globals()->bufsiz; }
void validate_input_file(const SoxFormat &sf, const std::string &path) {
if (static_cast<sox_format_t *>(sf) == nullptr) {
throw std::runtime_error(
"Error loading audio file: failed to open file " + path);
}
if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
throw std::runtime_error("Error loading audio file: unknown encoding.");
}
}
void validate_input_memfile(const SoxFormat &sf) {
return validate_input_file(sf, "<in memory buffer>");
}
std::string get_encoding(sox_encoding_t encoding) {
switch (encoding) {
case SOX_ENCODING_UNKNOWN:
return "UNKNOWN";
case SOX_ENCODING_SIGN2:
return "PCM_S";
case SOX_ENCODING_UNSIGNED:
return "PCM_U";
case SOX_ENCODING_FLOAT:
return "PCM_F";
case SOX_ENCODING_FLAC:
return "FLAC";
case SOX_ENCODING_ULAW:
return "ULAW";
case SOX_ENCODING_ALAW:
return "ALAW";
case SOX_ENCODING_MP3:
return "MP3";
case SOX_ENCODING_VORBIS:
return "VORBIS";
case SOX_ENCODING_AMR_WB:
return "AMR_WB";
case SOX_ENCODING_AMR_NB:
return "AMR_NB";
case SOX_ENCODING_OPUS:
return "OPUS";
case SOX_ENCODING_GSM:
return "GSM";
default:
return "UNKNOWN";
}
}
*/
} // namespace sox_utils
} // namespace paddleaudio

@ -1,116 +0,0 @@
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
// All rights reserved.
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <sox.h>
#include "paddlespeech/audio/src/optional/optional.hpp"
namespace py = pybind11;
namespace paddleaudio {
namespace sox_utils {
auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;
void set_seed(const int64_t seed);
void set_verbosity(const int64_t verbosity);
void set_use_threads(const bool use_threads);
void set_buffer_size(const int64_t buffer_size);
int64_t get_buffer_size();
std::vector<std::vector<std::string>> list_effects();
std::vector<std::string> list_read_formats();
std::vector<std::string> list_write_formats();
////////////////////////////////////////////////////////////////////////////////
// Utilities for sox_io / sox_effects implementations
////////////////////////////////////////////////////////////////////////////////
/// Names of sox effects that list_effects() leaves out of its result.
const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
{"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
/// Helper class that automatically closes a sox_format_t* when it is
/// destroyed.
struct SoxFormat {
// Wraps the given handle.
explicit SoxFormat(sox_format_t* fd) noexcept;
// Copying and moving are disabled, so exactly one wrapper holds a handle.
SoxFormat(const SoxFormat& other) = delete;
SoxFormat(SoxFormat&& other) = delete;
SoxFormat& operator=(const SoxFormat& other) = delete;
SoxFormat& operator=(SoxFormat&& other) = delete;
// Closes the handle via close().
~SoxFormat();
// Access to the wrapped handle.
sox_format_t* operator->() const noexcept;
operator sox_format_t*() const noexcept;
// Closes the handle if it is still open; may be called more than once.
void close();
private:
// The wrapped handle; nullptr once closed.
sox_format_t* fd_;
};
///
/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
void validate_input_tensor(const py::array);
void validate_input_file(const SoxFormat& sf, const std::string& path);
void validate_input_memfile(const SoxFormat &sf);
///
/// Get target dtype for the given encoding and precision.
py::dtype get_dtype(
const sox_encoding_t encoding,
const unsigned precision);
///
/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
/// NOTE: This function might modify the values in the input buffer to
/// reduce the number of memory copy.
/// @param buffer Pointer to buffer that contains audio data.
/// @param num_samples The number of samples to read.
/// @param num_channels The number of channels. Used to reshape the resulting
/// Tensor.
/// @param dtype Target dtype. Determines the output dtype and value range in
/// conjunction with normalization.
/// @param normalize Perform normalization. Only effective when dtype is not
/// kFloat32. When effective, the output tensor is kFloat32 type and value range
/// is [-1.0, 1.0]
/// @param channels_first When True, output Tensor has shape of [num_channels,
/// num_frames].
py::array convert_to_tensor(
sox_sample_t* buffer,
const int32_t num_samples,
const int32_t num_channels,
const py::dtype dtype,
const bool normalize,
const bool channels_first);
/// Extract extension from file path
const std::string get_filetype(const std::string path);
/// Get sox_signalinfo_t for passing a py::array object.
sox_signalinfo_t get_signalinfo(
const py::array* waveform,
const int64_t sample_rate,
const std::string filetype,
const bool channels_first);
/// Get sox_encodinginfo_t for Tensor I/O
sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
/// Get sox_encodinginfo_t for saving to file/file object
sox_encodinginfo_t get_encodinginfo_for_save(
const std::string& format,
const py::dtype dtype,
const tl::optional<double> compression,
const tl::optional<std::string> encoding,
const tl::optional<int64_t> bits_per_sample);
} // namespace sox_utils
} // namespace paddleaudio

@ -1,33 +0,0 @@
namespace paddleaudio {
namespace {
// Reports whether this build was compiled with INCLUDE_SOX defined.
bool is_sox_available() {
#ifdef INCLUDE_SOX
  constexpr bool kAvailable = true;
#else
  constexpr bool kAvailable = false;
#endif
  return kAvailable;
}
// Reports whether this build was compiled with INCLUDE_KALDI defined.
bool is_kaldi_available() {
#ifdef INCLUDE_KALDI
  constexpr bool kAvailable = true;
#else
  constexpr bool kAvailable = false;
#endif
  return kAvailable;
}
// Reports whether paddleaudio was compiled with ffmpeg (USE_FFMPEG defined).
// This says nothing about whether ffmpeg is available at runtime.
bool is_ffmpeg_available() {
#ifdef USE_FFMPEG
  constexpr bool kAvailable = true;
#else
  constexpr bool kAvailable = false;
#endif
  return kAvailable;
}
} // namespace
} // namespace paddleaudio

@ -295,7 +295,7 @@ def torch_video(key, data):
def paddle_audio(key, data):
"""Decode audio using the paddlespeech.audio library.
"""Decode audio using the paddleaudio library.
:param key: file name extension
:param data: data to be decoded
@ -304,13 +304,13 @@ def paddle_audio(key, data):
if extension not in ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma"]:
return None
import paddlespeech.audio
import paddleaudio
with tempfile.TemporaryDirectory() as dirname:
fname = os.path.join(dirname, f"file.{extension}")
with open(fname, "wb") as stream:
stream.write(data)
return paddlespeech.audio.load(fname)
return paddleaudio.backends.soundfile_load(fname)
################################################################

@ -25,8 +25,10 @@ import paddle
from . import autodecode
from . import utils
from .. import backends
from ..compliance import kaldi
from paddleaudio import backends
from paddleaudio.compliance import kaldi
from ..transform.cmvn import GlobalCMVN
from ..transform.spec_augment import freq_mask
from ..transform.spec_augment import time_mask

@ -20,7 +20,7 @@ trace = False
meta_prefix = "__"
meta_suffix = "__"
import paddlespeech
import paddleaudio
import paddle
import numpy as np
@ -111,7 +111,7 @@ def tar_file_iterator(fileobj,
assert pos > 0
prefix, postfix = name[:pos], name[pos + 1:]
if postfix == 'wav':
waveform, sample_rate = paddlespeech.audio.load(
waveform, sample_rate = paddleaudio.backends.soundfile_load(
stream.extractfile(tarinfo), normal=False)
result = dict(
fname=prefix, wav=waveform, sample_rate=sample_rate)
@ -163,7 +163,7 @@ def tar_file_and_group_iterator(fileobj,
if postfix == 'txt':
example['txt'] = file_obj.read().decode('utf8').strip()
elif postfix in AUDIO_FORMAT_SETS:
waveform, sample_rate = paddlespeech.audio.load(
waveform, sample_rate = paddleaudio.backends.soundfile_load(
file_obj, normal=False)
waveform = paddle.to_tensor(
np.expand_dims(np.array(waveform), 0),

@ -1,2 +0,0 @@
archives/
install/

@ -1,15 +0,0 @@
# Append -fvisibility=hidden to the C++ compiler flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
################################################################################
# sox
################################################################################
# The sox subdirectory is only added when BUILD_SOX is set.
if (BUILD_SOX)
add_subdirectory(sox)
endif()
################################################################################
# kaldi
################################################################################
# The kaldi subdirectory is only added when BUILD_KALDI is set.
if (BUILD_KALDI)
add_subdirectory(kaldi)
endif()

@ -1,117 +0,0 @@
# checkout the thirdparty/kaldi/base/kaldi-types.h
# compile kaldi without openfst
# The definition below is added to the compile flags of the targets declared
# in this directory.
add_definitions("-DCOMPILE_WITHOUT_OPENFST")
# function (define_library name source include_dirs link_libraries compile_defs)
# add_library(${name} INTERFACE ${source})
# target_include_directories(${name} INTERFACE ${include_dirs})
# target_link_libraries(${name} INTERFACE ${link_libraries})
# target_compile_definitions(${name} INTERFACE ${compile_defs})
# set_target_properties(${name} PROPERTIES PREFIX "")
# if (MSVC)
# set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
# endif(MSVC)
# install(
# TARGETS ${name}
# LIBRARY DESTINATION lib
# RUNTIME DESTINATION lib # For Windows
# )
# endfunction()
# Each kaldi component below is built as a static library. Every target
# exports this directory as a PUBLIC include directory, and link dependencies
# are declared PUBLIC so they propagate to targets that link against them.

# kaldi-base
add_library(kaldi-base STATIC
base/io-funcs.cc
base/kaldi-error.cc
base/kaldi-math.cc
base/kaldi-utils.cc
base/timer.cc
)
target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
# kaldi-matrix
add_library(kaldi-matrix STATIC
matrix/compressed-matrix.cc
matrix/matrix-functions.cc
matrix/kaldi-matrix.cc
matrix/kaldi-vector.cc
matrix/optimization.cc
matrix/packed-matrix.cc
matrix/qr.cc
matrix/sparse-matrix.cc
matrix/sp-matrix.cc
matrix/srfft.cc
matrix/tp-matrix.cc
)
target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
# Links against gfortran and libopenblas in addition to kaldi-base.
target_link_libraries(kaldi-matrix PUBLIC gfortran kaldi-base libopenblas)
# kaldi-util
add_library(kaldi-util STATIC
util/kaldi-holder.cc
util/kaldi-io.cc
util/kaldi-semaphore.cc
util/kaldi-table.cc
util/kaldi-thread.cc
util/parse-options.cc
util/simple-io-funcs.cc
util/simple-options.cc
util/text-utils.cc
)
target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
# kaldi-feat-common
# Sources shared by the mfcc and fbank libraries declared below.
add_library(kaldi-feat-common STATIC
feat/cmvn.cc
feat/feature-functions.cc
feat/feature-window.cc
feat/mel-computations.cc
feat/pitch-functions.cc
feat/resample.cc
feat/signal.cc
feat/wave-reader.cc
)
target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
# kaldi-mfcc
add_library(kaldi-mfcc STATIC
feat/feature-mfcc.cc
)
target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
# kaldi-fbank
add_library(kaldi-fbank STATIC
feat/feature-fbank.cc
)
target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
# Paths of the static archives produced by the targets above.
# NOTE(review): the lib*.a names are hard-coded, which assumes a Unix-style
# static library naming scheme - confirm if other platforms must be supported.
set(KALDI_LIBRARIES
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
)
# libkaldi is an INTERFACE target that bundles all of the archives above.
add_library(libkaldi INTERFACE)
# The archives are referenced by file path, so the build-order dependency on
# the targets that produce them is declared explicitly.
add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(libkaldi INTERFACE
# --whole-archive for undefined symbol when link static lib into shared lib
-Wl,--start-group -Wl,--whole-archive
${KALDI_LIBRARIES}
libopenblas
gfortran
-Wl,--no-whole-archive -Wl,--end-group
)
# Consumers of libkaldi are compiled with the same definition as the kaldi
# sources.
target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")

@ -1 +0,0 @@
../../../../speechx/speechx/kaldi/base

@ -1 +0,0 @@
../../../../speechx/speechx/kaldi/feat

@ -1 +0,0 @@
../../../../speechx/speechx/kaldi/matrix

@ -1 +0,0 @@
../../../../speechx/speechx/kaldi/util

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,86 +0,0 @@
See the following for the origin of this patch
http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
--- src/libmad/configure 2004-02-05 09:34:07.000000000 +0000
+++ src/libmad/configure.new 2020-06-30 21:10:28.528018931 +0000
@@ -19083,71 +19083,7 @@
if test "$GCC" = yes
then
- if test -z "$arch"
- then
- case "$host" in
- i386-*) ;;
- i?86-*) arch="-march=i486" ;;
- arm*-empeg-*) arch="-march=armv4 -mtune=strongarm1100" ;;
- armv4*-*) arch="-march=armv4 -mtune=strongarm" ;;
- powerpc-*) ;;
- mips*-agenda-*) arch="-mcpu=vr4100" ;;
- mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
- esac
- fi
-
- case "$optimize" in
- -O|"-O "*)
- optimize="-O"
- optimize="$optimize -fforce-mem"
- optimize="$optimize -fforce-addr"
- : #x optimize="$optimize -finline-functions"
- : #- optimize="$optimize -fstrength-reduce"
- optimize="$optimize -fthread-jumps"
- optimize="$optimize -fcse-follow-jumps"
- optimize="$optimize -fcse-skip-blocks"
- : #x optimize="$optimize -frerun-cse-after-loop"
- : #x optimize="$optimize -frerun-loop-opt"
- : #x optimize="$optimize -fgcse"
- optimize="$optimize -fexpensive-optimizations"
- optimize="$optimize -fregmove"
- : #* optimize="$optimize -fdelayed-branch"
- : #x optimize="$optimize -fschedule-insns"
- optimize="$optimize -fschedule-insns2"
- : #? optimize="$optimize -ffunction-sections"
- : #? optimize="$optimize -fcaller-saves"
- : #> optimize="$optimize -funroll-loops"
- : #> optimize="$optimize -funroll-all-loops"
- : #x optimize="$optimize -fmove-all-movables"
- : #x optimize="$optimize -freduce-all-givs"
- : #? optimize="$optimize -fstrict-aliasing"
- : #* optimize="$optimize -fstructure-noalias"
-
- case "$host" in
- arm*-*)
- optimize="$optimize -fstrength-reduce"
- ;;
- mips*-*)
- optimize="$optimize -fstrength-reduce"
- optimize="$optimize -finline-functions"
- ;;
- i?86-*)
- optimize="$optimize -fstrength-reduce"
- ;;
- powerpc-apple-*)
- # this triggers an internal compiler error with gcc2
- : #optimize="$optimize -fstrength-reduce"
-
- # this is really only beneficial with gcc3
- : #optimize="$optimize -finline-functions"
- ;;
- *)
- # this sometimes provokes bugs in gcc 2.95.2
- : #optimize="$optimize -fstrength-reduce"
- ;;
- esac
- ;;
- esac
+ optimize="-O2"
fi
case "$host" in
@@ -21497,6 +21433,7 @@
then
case "$host" in
i?86-*) FPM="INTEL" ;;
+ x86_64*) FPM="64BIT" ;;
arm*-*) FPM="ARM" ;;
mips*-*) FPM="MIPS" ;;
sparc*-*) FPM="SPARC" ;;

@ -1,16 +0,0 @@
See https://github.com/pytorch/audio/pull/1297
diff -ru sox/src/formats.c sox/src/formats.c
--- sox/src/formats.c 2014-10-26 19:55:50.000000000 -0700
+++ sox/src/formats.c 2021-02-22 16:01:02.833144070 -0800
@@ -333,6 +333,10 @@
assert(ft);
if (!ft->fp)
return sox_false;
- fstat(fileno((FILE*)ft->fp), &st);
+ int fd = fileno((FILE*)ft->fp);
+ if (fd < 0)
+ return sox_false;
+ if (fstat(fd, &st) < 0)
+ return sox_false;
return ((st.st_mode & S_IFMT) == S_IFREG);
}

@ -1,254 +0,0 @@
# Shared settings for the ExternalProject builds defined in this file.
find_package(PkgConfig REQUIRED)
include(ExternalProject)
# Install prefix, download cache and local patch directory, all resolved
# relative to the parent of this source directory.
set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches)
# Configure switches passed to every project: static-only, PIC, installed under INSTALL_DIR.
set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc)
# To pass custom environment variables to the ExternalProject_Add command,
# we need to do `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
# https://stackoverflow.com/a/62437353
# We construct the custom environment variables here.
# They point pkg-config, the linker and the compiler at INSTALL_DIR while
# keeping any LDFLAGS/CFLAGS already present in the environment.
set(envs
"PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
"LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
"CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
)
# Optional libmad 0.15.1b build, only when BUILD_MAD is set.
# The patch step applies libmad.patch and copies the local
# config.guess/config.sub into the unpacked source tree before configure runs.
if (BUILD_MAD)
ExternalProject_Add(mad
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
PATCH_COMMAND patch < ${patch_dir}/libmad.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/mad/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/mad/configure ${COMMON_ARGS}
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
endif (BUILD_MAD)
# opencore-amr 0.1.5. The patch step only copies the local
# config.guess/config.sub into the source tree; configure then runs with the
# shared environment and COMMON_ARGS.
ExternalProject_Add(amr
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.5.tar.gz
URL_HASH SHA256=2c006cb9d5f651bfb5e60156dbff6af3c9d35c7bbcc9015308c0aff1e14cd341
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/amr/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/amr/configure ${COMMON_ARGS}
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# LAME 3.99.5, configured with COMMON_ARGS plus --enable-nasm.
ExternalProject_Add(lame
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/lame/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/lame/configure ${COMMON_ARGS} --enable-nasm
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# libogg 1.3.3. flac, vorbis and opus below declare a dependency on this target.
ExternalProject_Add(ogg
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/configure ${COMMON_ARGS}
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# FLAC 1.3.2, built after ogg; configured with Ogg support and without the C++ libraries.
ExternalProject_Add(flac
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ogg
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# libvorbis 1.3.6, built after ogg and configured with --with-ogg.
ExternalProject_Add(vorbis
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ogg
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/configure ${COMMON_ARGS} --with-ogg
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# Opus 1.3.1, built after ogg and configured with --with-ogg.
ExternalProject_Add(opus
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ogg
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opus/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opus/configure ${COMMON_ARGS} --with-ogg
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# opusfile 0.12, built after opus; HTTP support is disabled at configure time.
ExternalProject_Add(opusfile
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS opus
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# OpenMP is by default compiled against GNU OpenMP, which conflicts with the version of OpenMP that PyTorch uses.
# See https://github.com/pytorch/audio/pull/1026
# NOTE(review): the rationale above cites PyTorch; presumably it was carried
# over from torchaudio - confirm it also applies to this project.
# TODO: Add flags like https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py
# Extra configure switches for sox: OpenMP off, the codecs built in this file
# on, and the remaining optional back ends and formats off.
set(SOX_OPTIONS
--disable-openmp
--with-amrnb
--with-amrwb
--with-flac
--with-lame
--with-oggvorbis
--with-opus
--without-alsa
--without-ao
--without-coreaudio
--without-oss
--without-id3tag
--without-ladspa
--without-magic
--without-png
--without-pulseaudio
--without-sndfile
--without-sndio
--without-sunaudio
--without-waveaudio
--without-wavpack
--without-twolame
)
# Static archives expected under INSTALL_DIR once sox and its codecs are installed.
set(SOX_LIBRARIES
${INSTALL_DIR}/lib/libsox.a
${INSTALL_DIR}/lib/libopencore-amrnb.a
${INSTALL_DIR}/lib/libopencore-amrwb.a
${INSTALL_DIR}/lib/libmp3lame.a
${INSTALL_DIR}/lib/libFLAC.a
${INSTALL_DIR}/lib/libopusfile.a
${INSTALL_DIR}/lib/libopus.a
${INSTALL_DIR}/lib/libvorbisenc.a
${INSTALL_DIR}/lib/libvorbisfile.a
${INSTALL_DIR}/lib/libvorbis.a
${INSTALL_DIR}/lib/libogg.a
)
# ExternalProject targets that must be built before sox.
set(sox_depends
ogg flac vorbis opusfile lame amr
)
# libmad is optional: with BUILD_MAD it is added to the options, the library
# list and the dependencies; otherwise sox is configured with --without-mad.
if (BUILD_MAD)
list(
APPEND
SOX_OPTIONS
--with-mad
)
list(
APPEND
SOX_LIBRARIES
${INSTALL_DIR}/lib/libmad.a
)
list(
APPEND
sox_depends
mad
)
else ()
list(
APPEND
SOX_OPTIONS
--without-mad
)
endif (BUILD_MAD)
# sox 14.4.2, built after every target in sox_depends. The patch step applies
# sox.patch (-p1) and copies the local config.guess/config.sub into the source
# tree. SOX_LIBRARIES is declared as BUILD_BYPRODUCTS of this target.
ExternalProject_Add(sox
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS ${sox_depends}
DOWNLOAD_DIR ${ARCHIVE_DIR}
URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
PATCH_COMMAND patch -p1 < ${patch_dir}/sox.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS}
BUILD_BYPRODUCTS ${SOX_LIBRARIES}
DOWNLOAD_NO_PROGRESS ON
LOG_DOWNLOAD ON
LOG_UPDATE ON
LOG_CONFIGURE ON
LOG_BUILD ON
LOG_INSTALL ON
LOG_MERGED_STDOUTERR ON
LOG_OUTPUT_ON_FAILURE ON
)
# Interface target for consumers: depends on the sox external build and
# exposes the installed headers and the static archives in SOX_LIBRARIES.
add_library(libsox INTERFACE)
add_dependencies(libsox sox)
target_include_directories(libsox INTERFACE ${INSTALL_DIR}/include)
target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES})

@ -17,7 +17,7 @@ import numpy as np
import paddle
from python_speech_features import logfbank
from ..compliance import kaldi
from paddleaudio.compliance import kaldi
def stft(x,

@ -1,101 +0,0 @@
from typing import Dict, List
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio import _paddleaudio
@_mod_utils.requires_sox()
def set_seed(seed: int):
    """Seed the pseudo-random number generator of libsox.

    Args:
        seed (int): Seed value. The valid range is that of int32.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_seed(seed)
@_mod_utils.requires_sox()
def set_verbosity(verbosity: int):
    """Choose how much diagnostic output libsox produces.

    Args:
        verbosity (int): Verbosity level of libsox.

            * ``1`` failure messages
            * ``2`` warnings
            * ``3`` details of processing
            * ``4``-``6`` increasing levels of debug messages

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_verbosity(verbosity)
@_mod_utils.requires_sox()
def set_buffer_size(buffer_size: int):
    """Configure the buffer size of the sox effect chain.

    Args:
        buffer_size (int): Size in bytes of the buffers used for processing audio.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_buffer_size(buffer_size)
@_mod_utils.requires_sox()
def set_use_threads(use_threads: bool):
    """Toggle the multithread option of the sox effect chain.

    Args:
        use_threads (bool): When ``True``, enables ``libsox``'s parallel
            effects channels processing. To use multithreading, the underlying
            ``libsox`` has to be compiled with OpenMP support.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    _paddleaudio.sox_utils_set_use_threads(use_threads)
@_mod_utils.requires_sox()
def list_effects() -> Dict[str, str]:
    """Enumerate the sox effects that are available.

    Returns:
        Dict[str, str]: Mapping from ``effect name`` to ``usage``
    """
    effects = _paddleaudio.sox_utils_list_effects()
    return dict(effects)
@_mod_utils.requires_sox()
def list_read_formats() -> List[str]:
    """Enumerate the audio formats that can be read.

    Returns:
        List[str]: List of supported audio formats
    """
    readable = _paddleaudio.sox_utils_list_read_formats()
    return readable
@_mod_utils.requires_sox()
def list_write_formats() -> List[str]:
    """Enumerate the audio formats that can be written.

    Returns:
        List[str]: List of supported audio formats
    """
    writable = _paddleaudio.sox_utils_list_write_formats()
    return writable
@_mod_utils.requires_sox()
def get_buffer_size() -> int:
    """Report the buffer size of the sox effect chain.

    Returns:
        int: size in bytes of buffers used for processing audio.
    """
    size_in_bytes = _paddleaudio.sox_utils_get_buffer_size()
    return size_in_bytes

@ -1,38 +0,0 @@
# 1. Prepare
First, install `pytest-benchmark` via pip.
```sh
pip install pytest-benchmark
```
# 2. Run
Run the specific script for profiling.
```sh
pytest melspectrogram.py
```
Result:
```sh
========================================================================== test session starts ==========================================================================
platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0
benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000)
plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0
collected 4 items
melspectrogram.py .... [100%]
-------------------------------------------------------------------------------------------------- benchmark: 4 tests -------------------------------------------------------------------------------------------------
Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_melspect_gpu_torchaudio 202.0765 (1.0) 360.6230 (1.0) 218.1168 (1.0) 16.3022 (1.0) 214.2871 (1.0) 21.8451 (1.0) 40;3 4,584.7001 (1.0) 286 1
test_melspect_gpu 657.8509 (3.26) 908.0470 (2.52) 724.2545 (3.32) 106.5771 (6.54) 669.9096 (3.13) 113.4719 (5.19) 1;0 1,380.7300 (0.30) 5 1
test_melspect_cpu_torchaudio 1,247.6053 (6.17) 2,892.5799 (8.02) 1,443.2853 (6.62) 345.3732 (21.19) 1,262.7263 (5.89) 221.6385 (10.15) 56;53 692.8637 (0.15) 399 1
test_melspect_cpu 20,326.2549 (100.59) 20,607.8682 (57.15) 20,473.4125 (93.86) 63.8654 (3.92) 20,467.0429 (95.51) 68.4294 (3.13) 8;1 48.8438 (0.01) 29 1
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Legend:
Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile.
OPS: Operations Per Second, computed as 1 / Mean
========================================================================== 4 passed in 21.12s ===========================================================================
```

@ -1,125 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib.request
import librosa
import numpy as np
import paddle
import torch
import torchaudio
import paddlespeech.audio
# Reference clip shared by every benchmark in this module. It is downloaded
# into the current working directory only when no file of that name exists.
wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
if not os.path.isfile(os.path.basename(wav_url)):
    urllib.request.urlretrieve(wav_url, os.path.basename(wav_url))

waveform, sr = paddlespeech.audio.load(
    os.path.abspath(os.path.basename(wav_url)))
# Both frameworks receive the signal with one extra leading axis.
waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0)
waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0)

# Feature conf
mel_conf = dict(sr=sr, n_fft=512, hop_length=128, n_mels=40)

mel_conf_torchaudio = dict(
    sample_rate=sr,
    n_fft=512,
    hop_length=128,
    n_mels=40,
    norm='slaney',
    mel_scale='slaney', )
def enable_cpu_device():
    """Select the CPU as the Paddle device."""
    paddle.set_device('cpu')


def enable_gpu_device():
    """Select the GPU as the Paddle device."""
    paddle.set_device('gpu')
# The extractor is created once at import time; the benchmarked callable
# below only applies it.
log_mel_extractor = paddlespeech.audio.features.LogMelSpectrogram(
    **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype)


def log_melspectrogram():
    """Apply the Paddle log-mel extractor to the reference clip and squeeze axis 0."""
    features = log_mel_extractor(waveform_tensor)
    return features.squeeze(0)
def test_log_melspect_cpu(benchmark):
    """Benchmark the Paddle log-mel extractor on CPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``log_melspectrogram``.
    """
    enable_cpu_device()
    feature_audio = benchmark(log_melspectrogram)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
def test_log_melspect_gpu(benchmark):
    """Benchmark the Paddle log-mel extractor on GPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``log_melspectrogram``.
    """
    enable_gpu_device()
    feature_audio = benchmark(log_melspectrogram)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    # The GPU comparison uses a looser tolerance (2 decimals) than the CPU one.
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=2)
# torchaudio counterparts, created once at import time. The tests rebind
# these globals after moving them to the target device.
mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram(
    **mel_conf_torchaudio, f_min=0.0)
amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db=80.0)


def melspectrogram_torchaudio():
    """Apply the torchaudio mel extractor to the reference clip and squeeze axis 0."""
    mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch)
    return mel_specgram.squeeze(0)


def log_melspectrogram_torchaudio():
    """Apply the torchaudio mel extractor, convert to dB and squeeze axis 0."""
    log_mel = amplitude_to_DB(mel_extractor_torchaudio(waveform_tensor_torch))
    return log_mel.squeeze(0)
def test_log_melspect_cpu_torchaudio(benchmark):
    """Benchmark the torchaudio log-mel pipeline on CPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time
            ``log_melspectrogram_torchaudio``.
    """
    global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB
    # Rebind the module globals so the benchmarked helper sees the CPU copies.
    mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu')
    waveform_tensor_torch = waveform_tensor_torch.to('cpu')
    amplitude_to_DB = amplitude_to_DB.to('cpu')
    feature_audio = benchmark(log_melspectrogram_torchaudio)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
def test_log_melspect_gpu_torchaudio(benchmark):
    """Benchmark the torchaudio log-mel pipeline on CUDA and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time
            ``log_melspectrogram_torchaudio``.
    """
    global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB
    # Rebind the module globals so the benchmarked helper sees the CUDA copies.
    mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda')
    waveform_tensor_torch = waveform_tensor_torch.to('cuda')
    amplitude_to_DB = amplitude_to_DB.to('cuda')
    feature_torchaudio = benchmark(log_melspectrogram_torchaudio)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0)
    # Bring the result back to host memory before the NumPy comparison.
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_torchaudio.cpu(), decimal=2)

@ -1,109 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib.request
import librosa
import numpy as np
import paddle
import torch
import torchaudio
import paddlespeech.audio
# Reference clip shared by every benchmark in this module. It is downloaded
# into the current working directory only when no file of that name exists.
wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
if not os.path.isfile(os.path.basename(wav_url)):
    urllib.request.urlretrieve(wav_url, os.path.basename(wav_url))

waveform, sr = paddlespeech.audio.load(
    os.path.abspath(os.path.basename(wav_url)))
# Both frameworks receive the signal with one extra leading axis.
waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0)
waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0)

# Feature conf
mel_conf = dict(sr=sr, n_fft=512, hop_length=128, n_mels=40)

mel_conf_torchaudio = dict(
    sample_rate=sr,
    n_fft=512,
    hop_length=128,
    n_mels=40,
    norm='slaney',
    mel_scale='slaney', )
def enable_cpu_device():
    """Select the CPU as the Paddle device."""
    paddle.set_device('cpu')


def enable_gpu_device():
    """Select the GPU as the Paddle device."""
    paddle.set_device('gpu')
# The extractor is created once at import time; the benchmarked callable
# below only applies it.
mel_extractor = paddlespeech.audio.features.MelSpectrogram(
    **mel_conf, f_min=0.0, dtype=waveform_tensor.dtype)


def melspectrogram():
    """Apply the Paddle mel extractor to the reference clip and squeeze axis 0."""
    features = mel_extractor(waveform_tensor)
    return features.squeeze(0)
def test_melspect_cpu(benchmark):
    """Benchmark the Paddle mel extractor on CPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``melspectrogram``.
    """
    enable_cpu_device()
    feature_audio = benchmark(melspectrogram)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
def test_melspect_gpu(benchmark):
    """Benchmark the Paddle mel extractor on GPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``melspectrogram``.
    """
    enable_gpu_device()
    feature_audio = benchmark(melspectrogram)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
# torchaudio counterpart, created once at import time. The tests rebind this
# global after moving it to the target device.
mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram(
    **mel_conf_torchaudio, f_min=0.0)


def melspectrogram_torchaudio():
    """Apply the torchaudio mel extractor to the reference clip and squeeze axis 0."""
    mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch)
    return mel_specgram.squeeze(0)
def test_melspect_cpu_torchaudio(benchmark):
    """Benchmark the torchaudio mel extractor on CPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time
            ``melspectrogram_torchaudio``.
    """
    global waveform_tensor_torch, mel_extractor_torchaudio
    # Rebind the module globals so the benchmarked helper sees the CPU copies.
    mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu')
    waveform_tensor_torch = waveform_tensor_torch.to('cpu')
    feature_audio = benchmark(melspectrogram_torchaudio)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
def test_melspect_gpu_torchaudio(benchmark):
    """Benchmark the torchaudio mel extractor on CUDA and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time
            ``melspectrogram_torchaudio``.
    """
    global waveform_tensor_torch, mel_extractor_torchaudio
    # Rebind the module globals so the benchmarked helper sees the CUDA copies.
    mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda')
    waveform_tensor_torch = waveform_tensor_torch.to('cuda')
    feature_torchaudio = benchmark(melspectrogram_torchaudio)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.melspectrogram(y=waveform, **mel_conf)
    # Bring the result back to host memory before the NumPy comparison.
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_torchaudio.cpu(), decimal=3)

@ -1,123 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib.request
import librosa
import numpy as np
import paddle
import torch
import torchaudio
import paddlespeech.audio
# Reference clip shared by every benchmark in this module. It is downloaded
# into the current working directory only when no file of that name exists.
wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
if not os.path.isfile(os.path.basename(wav_url)):
    urllib.request.urlretrieve(wav_url, os.path.basename(wav_url))

waveform, sr = paddlespeech.audio.load(
    os.path.abspath(os.path.basename(wav_url)))
# Both frameworks receive the signal with one extra leading axis.
waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0)
waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0)

# Feature conf
mel_conf = dict(sr=sr, n_fft=512, hop_length=128, n_mels=40)

# MFCC settings for the Paddle extractor: the MFCC-specific keys first,
# followed by every mel setting.
mfcc_conf = {'n_mfcc': 20, 'top_db': 80.0, **mel_conf}

mel_conf_torchaudio = dict(
    sample_rate=sr,
    n_fft=512,
    hop_length=128,
    n_mels=40,
    norm='slaney',
    mel_scale='slaney', )

mfcc_conf_torchaudio = dict(sample_rate=sr, n_mfcc=20)
def enable_cpu_device():
    """Select the CPU as the Paddle device."""
    paddle.set_device('cpu')


def enable_gpu_device():
    """Select the GPU as the Paddle device."""
    paddle.set_device('gpu')
# The extractor is created once at import time; the benchmarked callable
# below only applies it.
mfcc_extractor = paddlespeech.audio.features.MFCC(
    **mfcc_conf, f_min=0.0, dtype=waveform_tensor.dtype)


def mfcc():
    """Apply the Paddle MFCC extractor to the reference clip and squeeze axis 0."""
    coefficients = mfcc_extractor(waveform_tensor)
    return coefficients.squeeze(0)
def test_mfcc_cpu(benchmark):
    """Benchmark the Paddle MFCC extractor on CPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``mfcc``.
    """
    enable_cpu_device()
    feature_audio = benchmark(mfcc)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.mfcc(y=waveform, **mel_conf)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
def test_mfcc_gpu(benchmark):
    """Benchmark the Paddle MFCC extractor on GPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``mfcc``.
    """
    enable_gpu_device()
    feature_audio = benchmark(mfcc)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.mfcc(y=waveform, **mel_conf)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
# The sample rate is passed to MFCC directly through mfcc_conf_torchaudio, so
# it is removed from the mel keyword arguments before they are forwarded.
del mel_conf_torchaudio['sample_rate']
mfcc_extractor_torchaudio = torchaudio.transforms.MFCC(
    melkwargs=mel_conf_torchaudio, **mfcc_conf_torchaudio)


def mfcc_torchaudio():
    """Apply the torchaudio MFCC extractor to the reference clip and squeeze axis 0."""
    coefficients = mfcc_extractor_torchaudio(waveform_tensor_torch)
    return coefficients.squeeze(0)
def test_mfcc_cpu_torchaudio(benchmark):
    """Benchmark the torchaudio MFCC extractor on CPU and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``mfcc_torchaudio``.
    """
    global waveform_tensor_torch, mfcc_extractor_torchaudio
    # Rebind the declared globals. The original bound the moved extractor to
    # an unrelated local name (``mel_extractor_torchaudio``) that nothing read.
    mfcc_extractor_torchaudio = mfcc_extractor_torchaudio.to('cpu')
    waveform_tensor_torch = waveform_tensor_torch.to('cpu')
    feature_audio = benchmark(mfcc_torchaudio)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.mfcc(y=waveform, **mel_conf)
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_audio, decimal=3)
def test_mfcc_gpu_torchaudio(benchmark):
    """Benchmark the torchaudio MFCC extractor on CUDA and compare it with librosa.

    Args:
        benchmark: pytest-benchmark fixture used to time ``mfcc_torchaudio``.
    """
    global waveform_tensor_torch, mfcc_extractor_torchaudio
    # Rebind the declared globals. The original bound the moved extractor to
    # an unrelated local name (``mel_extractor_torchaudio``) that nothing read.
    mfcc_extractor_torchaudio = mfcc_extractor_torchaudio.to('cuda')
    waveform_tensor_torch = waveform_tensor_torch.to('cuda')
    feature_torchaudio = benchmark(mfcc_torchaudio)
    # Pass the signal as ``y=``: librosa >= 0.10 rejects it positionally.
    feature_librosa = librosa.feature.mfcc(y=waveform, **mel_conf)
    # Bring the result back to host memory before the NumPy comparison.
    np.testing.assert_array_almost_equal(
        feature_librosa, feature_torchaudio.cpu(), decimal=3)

@ -1,13 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -1,34 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
import urllib.request
mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'
multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav'
class BackendTest(unittest.TestCase):
    """Base test case that makes the sample wav files available locally."""

    def setUp(self):
        self.initWavInput()

    def initWavInput(self):
        """Download each sample clip that is missing and record the local names.

        After the call ``self.files`` holds the base name of every sample
        URL, in the order mono then multi-channel.
        """
        self.files = []
        for url in (mono_channel_wav, multi_channels_wav):
            if not os.path.isfile(os.path.basename(url)):
                urllib.request.urlretrieve(url, os.path.basename(url))
            self.files.append(os.path.basename(url))

    def initParmas(self):
        """Hook for subclasses; the base implementation always raises."""
        raise NotImplementedError

@ -1,32 +0,0 @@
def get_encoding(ext, dtype):
    """Return the encoding name expected for an extension / dtype pair.

    For "mp3", "flac" and "vorbis" the upper-cased extension is returned and
    ``dtype`` is ignored. Any other extension is resolved from ``dtype``.

    Raises:
        KeyError: ``ext`` is not one of the three above and ``dtype`` is not
            "float32", "int32", "int16" or "uint8".
    """
    if ext in {"mp3", "flac", "vorbis"}:
        return ext.upper()
    pcm_by_dtype = {
        "float32": "PCM_F",
        "int32": "PCM_S",
        "int16": "PCM_S",
        "uint8": "PCM_U",
    }
    return pcm_by_dtype[dtype]
def get_bit_depth(dtype):
    """Return the number of bits per sample for a dtype name.

    Raises:
        KeyError: ``dtype`` is not "float32", "int32", "int16" or "uint8".
    """
    depth_by_dtype = {"float32": 32, "int32": 32, "int16": 16, "uint8": 8}
    return depth_by_dtype[dtype]
def get_bits_per_sample(ext, dtype):
    """Return the bits-per-sample value expected for an extension / dtype pair.

    "flac" maps to 24 and "mp3" / "vorbis" map to 0 regardless of ``dtype``.
    Any other extension falls back to ``get_bit_depth(dtype)``.

    Raises:
        KeyError: ``ext`` has no fixed value and ``dtype`` is unknown to
            ``get_bit_depth``.
    """
    fixed_bits = {
        "flac": 24,
        "mp3": 0,
        "vorbis": 0,
    }
    # Only consult the dtype when the extension has no fixed value. Passing
    # get_bit_depth(dtype) as the .get() default evaluated it eagerly, so an
    # unknown dtype raised KeyError even for "flac"/"mp3"/"vorbis".
    if ext in fixed_bits:
        return fixed_bits[ext]
    return get_bit_depth(dtype)

@ -1,13 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -1,57 +0,0 @@
import itertools
from unittest import skipIf
from parameterized import parameterized
from paddlespeech.audio._internal.module_utils import is_module_available
def name_func(func, _, params):
    """Build a test name from the function name and its stringified positional params.

    Args:
        func: The test function being parameterized.
        _: Unused.
        params: Object whose ``args`` attribute holds the positional parameters.
    """
    suffix = "_".join(map(str, params.args))
    return f'{func.__name__}_{suffix}'
def dtype2subtype(dtype):
    """Map a dtype name to the matching subtype name.

    Raises:
        KeyError: ``dtype`` is not one of the six names in the table.
    """
    subtype_by_dtype = {
        "float64": "DOUBLE",
        "float32": "FLOAT",
        "int32": "PCM_32",
        "int16": "PCM_16",
        "uint8": "PCM_U8",
        "int8": "PCM_S8",
    }
    return subtype_by_dtype[dtype]
def skipIfFormatNotSupported(fmt):
    """Return a decorator that skips a test when soundfile cannot handle ``fmt``.

    The test is skipped unconditionally when the ``soundfile`` module is not
    available, and otherwise when ``fmt`` is missing from
    ``soundfile.available_formats()``.
    """
    if not is_module_available("soundfile"):
        return skipIf(True, '"soundfile" not available.')

    import soundfile

    supported = soundfile.available_formats()
    return skipIf(fmt not in supported,
                  f'"{fmt}" is not supported by soundfile')
def parameterize(*params):
    """Expand a test over the Cartesian product of the given parameter lists.

    Generated tests are named by ``name_func``.
    """
    combinations = list(itertools.product(*params))
    return parameterized.expand(combinations, name_func=name_func)
def fetch_wav_subtype(dtype, encoding, bits_per_sample):
    """Return the wav subtype for an encoding / bits-per-sample request.

    Args:
        dtype: dtype name, used only when both ``encoding`` and
            ``bits_per_sample`` are ``None``.
        encoding: ``None``, "PCM_U", "PCM_S", "PCM_F", "ULAW" or "ALAW".
        bits_per_sample: ``None`` or a bit depth valid for ``encoding``.

    Raises:
        ValueError: the (encoding, bits_per_sample) pair is not supported.
        KeyError: both are ``None`` and ``dtype`` is unknown to
            ``dtype2subtype``.
    """
    # Resolve the dtype only when nothing else was requested. Building the
    # (None, None) table entry eagerly made every call depend on ``dtype``,
    # so an unknown dtype raised KeyError even with an explicit encoding.
    if encoding is None and bits_per_sample is None:
        return dtype2subtype(dtype)

    subtype = {
        (None, 8): "PCM_U8",
        ("PCM_U", None): "PCM_U8",
        ("PCM_U", 8): "PCM_U8",
        ("PCM_S", None): "PCM_32",
        ("PCM_S", 16): "PCM_16",
        ("PCM_S", 32): "PCM_32",
        ("PCM_F", None): "FLOAT",
        ("PCM_F", 32): "FLOAT",
        ("PCM_F", 64): "DOUBLE",
        ("ULAW", None): "ULAW",
        ("ULAW", 8): "ULAW",
        ("ALAW", None): "ALAW",
        ("ALAW", 8): "ALAW",
    }.get((encoding, bits_per_sample))
    if subtype:
        return subtype
    raise ValueError(f"wav does not support ({encoding}, {bits_per_sample}).")

@ -1,199 +0,0 @@
#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py
import tarfile
import warnings
import unittest
from unittest.mock import patch
import paddle
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio.backends import soundfile_backend
from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding
from tests.unit.common_utils import (
get_wav_data,
nested_params,
save_wav,
TempDirMixin,
)
from common import parameterize, skipIfFormatNotSupported
import soundfile
class TestInfo(TempDirMixin, unittest.TestCase):
    """Checks the metadata reported by ``soundfile_backend.info`` for files on disk."""

    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2], )
    def test_wav(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.info` can check wav file correctly"""
        duration = 1
        wav_path = self.get_temp_path("data.wav")
        samples = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=duration * sample_rate)
        save_wav(wav_path, samples, sample_rate)

        meta = soundfile_backend.info(wav_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == sample_rate * duration
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == get_bits_per_sample("wav", dtype)
        assert meta.encoding == get_encoding("wav", dtype)

    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, sample_rate, num_channels):
        """`soundfile_backend.info` can check flac file correctly"""
        duration = 1
        num_frames = sample_rate * duration
        samples = paddle.randn(shape=[num_frames, num_channels]).numpy()
        flac_path = self.get_temp_path("data.flac")
        soundfile.write(flac_path, samples, sample_rate)

        meta = soundfile_backend.info(flac_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == num_frames
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == 16
        assert meta.encoding == "FLAC"

    # NOTE(review): the OGG variant below is disabled. Its frame-count
    # assertion was already commented out, so presumably the reported
    # ``num_frames`` did not match for Vorbis - confirm before re-enabling.
    # @parameterize([8000, 16000], [1, 2])
    # @skipIfFormatNotSupported("OGG")
    # def test_ogg(self, sample_rate, num_channels):
    #     """`soundfile_backend.info` can check ogg file correctly"""
    #     duration = 1
    #     num_frames = sample_rate * duration
    #     data = paddle.randn(shape=[num_frames, num_channels]).numpy()
    #     path = self.get_temp_path("data.ogg")
    #     soundfile.write(path, data, sample_rate)
    #     info = soundfile_backend.info(path)
    #     assert info.sample_rate == sample_rate
    #     # assert info.num_frames == sample_rate * duration
    #     assert info.num_channels == num_channels
    #     assert info.bits_per_sample == 0
    #     assert info.encoding == "VORBIS"

    @nested_params(
        [8000, 16000],
        [1, 2],
        [("PCM_24", 24), ("PCM_32", 32)], )
    @skipIfFormatNotSupported("NIST")
    def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth):
        """`soundfile_backend.info` can check sph file correctly"""
        duration = 1
        num_frames = sample_rate * duration
        samples = paddle.randn(shape=[num_frames, num_channels]).numpy()
        sph_path = self.get_temp_path("data.nist")
        subtype, bits_per_sample = subtype_and_bit_depth
        soundfile.write(sph_path, samples, sample_rate, subtype=subtype)

        meta = soundfile_backend.info(sph_path)
        assert meta.sample_rate == sample_rate
        assert meta.num_frames == sample_rate * duration
        assert meta.num_channels == num_channels
        assert meta.bits_per_sample == bits_per_sample
        assert meta.encoding == "PCM_S"

    def test_unknown_subtype_warning(self):
        """soundfile_backend.info issues a warning when the subtype is unknown

        This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE
        dict should be updated.
        """

        class MockSoundFileInfo:
            # Stand-in for the object returned by ``soundfile.info``; only the
            # subtype is deliberately unrecognised.
            samplerate = 8000
            frames = 356
            channels = 2
            subtype = "UNSEEN_SUBTYPE"
            format = "UNKNOWN"

        def _mock_info_func(_):
            return MockSoundFileInfo()

        with patch("soundfile.info", _mock_info_func):
            with warnings.catch_warnings(record=True) as w:
                info = soundfile_backend.info("foo")
                assert len(w) == 1
                assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(
                    w[-1].message)
                assert info.bits_per_sample == 0
class TestFileObject(TempDirMixin, unittest.TestCase):
    """Checks ``soundfile_backend.info`` when given file-like objects instead of paths."""

    def _test_fileobj(self, ext, subtype, bits_per_sample):
        """Query audio via file-like object works"""
        duration = 2
        sample_rate = 16000
        num_channels = 2
        num_frames = sample_rate * duration
        path = self.get_temp_path(f"test.{ext}")

        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        soundfile.write(path, data, sample_rate, subtype=subtype)

        with open(path, "rb") as fileobj:
            info = soundfile_backend.info(fileobj)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        # Compute the expectation first. Written inline,
        # `assert a == "FLAC" if c else "PCM_S"` parses as
        # `assert ((a == "FLAC") if c else "PCM_S")`, which is always true for
        # non-flac files and never checked the encoding.
        expected_encoding = "FLAC" if ext == "flac" else "PCM_S"
        assert info.encoding == expected_encoding

    def test_fileobj_wav(self):
        """Loading audio via file-like object works"""
        self._test_fileobj("wav", "PCM_16", 16)

    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Loading audio via file-like object works"""
        self._test_fileobj("flac", "PCM_16", 16)

    def _test_tarobj(self, ext, subtype, bits_per_sample):
        """Query compressed audio via file-like object works"""
        duration = 2
        sample_rate = 16000
        num_channels = 2
        num_frames = sample_rate * duration
        audio_file = f"test.{ext}"
        audio_path = self.get_temp_path(audio_file)
        archive_path = self.get_temp_path("archive.tar.gz")

        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
        soundfile.write(audio_path, data, sample_rate, subtype=subtype)

        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            info = soundfile_backend.info(fileobj)
        assert info.sample_rate == sample_rate
        assert info.num_frames == num_frames
        assert info.num_channels == num_channels
        assert info.bits_per_sample == bits_per_sample
        # Same precedence fix as in _test_fileobj: compare against an
        # explicitly computed expectation.
        expected_encoding = "FLAC" if ext == "flac" else "PCM_S"
        assert info.encoding == expected_encoding

    def test_tarobj_wav(self):
        """Query compressed audio via file-like object works"""
        self._test_tarobj("wav", "PCM_16", 16)

    @skipIfFormatNotSupported("FLAC")
    def test_tarobj_flac(self):
        """Query compressed audio via file-like object works"""
        self._test_tarobj("flac", "PCM_16", 16)
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

@ -1,369 +0,0 @@
#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py
import os
import tarfile
import unittest
from unittest.mock import patch
import numpy as np
from parameterized import parameterized
import paddle
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio.backends import soundfile_backend
from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding
from tests.unit.common_utils import (
get_wav_data,
load_wav,
nested_params,
normalize_wav,
save_wav,
TempDirMixin,
)
from common import dtype2subtype, parameterize, skipIfFormatNotSupported
import soundfile
def _get_mock_path(
ext: str,
dtype: str,
sample_rate: int,
num_channels: int,
num_frames: int,
):
return f"{dtype}_{sample_rate}_{num_channels}_{num_frames}.{ext}"
def _get_mock_params(path: str):
filename, ext = path.split(".")
parts = filename.split("_")
return {
"ext": ext,
"dtype": parts[0],
"sample_rate": int(parts[1]),
"num_channels": int(parts[2]),
"num_frames": int(parts[3]),
}
class SoundFileMock:
    """Replacement for ``soundfile.SoundFile`` driven by parameters encoded in the path.

    The path is decoded with `_get_mock_params`; no file is opened.
    """

    # Extension -> value reported by `format`. Other extensions yield None.
    _FORMAT_BY_EXT = {
        "wav": "WAV",
        "flac": "FLAC",
        "ogg": "OGG",
        "sph": "NIST",
        "nis": "NIST",
        "nist": "NIST",
    }

    def __init__(self, path, mode):
        assert mode == "r"
        self.path = path
        self._params = _get_mock_params(path)
        # First frame to return from `read`; set by `_prepare_read`.
        self._start = None

    @property
    def samplerate(self):
        """Sample rate decoded from the path."""
        return self._params["sample_rate"]

    @property
    def format(self):
        """Format name derived from the extension, or None when it is not listed."""
        return self._FORMAT_BY_EXT.get(self._params["ext"])

    @property
    def subtype(self):
        """``"VORBIS"`` for ogg, otherwise the subtype `dtype2subtype` maps the dtype to."""
        if self._params["ext"] != "ogg":
            return dtype2subtype(self._params["dtype"])
        return "VORBIS"

    def _prepare_read(self, start, stop, frames):
        """Remember the start offset and hand back ``frames`` unchanged."""
        assert stop is None
        self._start = start
        return frames

    def read(self, frames, dtype, always_2d):
        """Generate data with `get_wav_data` and return ``frames`` rows from the stored start."""
        assert always_2d
        generated = get_wav_data(
            dtype,
            self._params["num_channels"],
            normalize=False,
            num_frames=self._params["num_frames"],
            channels_first=False,
        ).numpy()
        first = self._start
        return generated[first:first + frames]

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        # Nothing to release.
        pass
class MockedLoadTest(unittest.TestCase):
    """Check the dtype returned by `soundfile_backend.load` with ``soundfile.SoundFile`` mocked."""

    def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize, channels_first):
        """Load a mocked file and compare the resulting dtype and sample rate.

        float32 is expected when ``normalize`` is true or ``ext`` is neither
        ``"wav"`` nor ``"nist"``; otherwise the paddle dtype named by ``dtype``.
        """
        num_frames = 3 * sample_rate
        mock_path = _get_mock_path(ext, dtype, sample_rate, num_channels, num_frames)
        if normalize or ext not in ["wav", "nist"]:
            expected_dtype = paddle.float32
        else:
            expected_dtype = getattr(paddle, dtype)
        with patch("soundfile.SoundFile", SoundFileMock):
            loaded, loaded_rate = soundfile_backend.load(
                mock_path, normalize=normalize, channels_first=channels_first
            )
            assert loaded.dtype == expected_dtype
            assert sample_rate == loaded_rate

    @parameterize(
        ["int32", "float32", "float64"],
        [8000, 16000],
        [1, 2],
        [True, False],
        [True, False],
    )
    def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first):
        """Returns native dtype when normalize=False else float32"""
        self.assert_dtype("wav", dtype, sample_rate, num_channels, normalize, channels_first)

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [True, False],
        [True, False],
    )
    def test_sphere(self, dtype, sample_rate, num_channels, normalize, channels_first):
        """Returns float32 always"""
        self.assert_dtype("sph", dtype, sample_rate, num_channels, normalize, channels_first)

    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
    def test_ogg(self, sample_rate, num_channels, normalize, channels_first):
        """Returns float32 always"""
        self.assert_dtype("ogg", "int16", sample_rate, num_channels, normalize, channels_first)

    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
    def test_flac(self, sample_rate, num_channels, normalize, channels_first):
        """Returns float32 always"""
        self.assert_dtype("flac", "int16", sample_rate, num_channels, normalize, channels_first)
class LoadTestBase(TempDirMixin, unittest.TestCase):
    """Shared assertions for `soundfile_backend.load` tests that write real files."""

    def assert_wav(
        self,
        dtype,
        sample_rate,
        num_channels,
        normalize,
        channels_first=True,
        duration=1,
    ):
        """Save a wav file and compare `soundfile_backend.load` with `load_wav` on it."""
        wav_path = self.get_temp_path("reference.wav")
        frame_count = duration * sample_rate
        generated = get_wav_data(
            dtype,
            num_channels,
            normalize=normalize,
            num_frames=frame_count,
            channels_first=channels_first,
        )
        save_wav(wav_path, generated, sample_rate, channels_first=channels_first)
        reference, _ = load_wav(wav_path, normalize=normalize, channels_first=channels_first)
        loaded, loaded_rate = soundfile_backend.load(
            wav_path, normalize=normalize, channels_first=channels_first
        )
        assert loaded_rate == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())

    def _assert_written_with_soundfile(
        self, filename, dtype, sample_rate, num_channels, channels_first, duration, **write_kwargs
    ):
        """Write unnormalized data with ``soundfile.write`` and compare the loaded result.

        The expected value is `normalize_wav` applied to the written data,
        transposed first when ``channels_first`` is true.
        """
        target = self.get_temp_path(filename)
        frame_count = duration * sample_rate
        raw = get_wav_data(
            dtype,
            num_channels,
            num_frames=frame_count,
            normalize=False,
            channels_first=False,
        )
        soundfile.write(target, raw, sample_rate, **write_kwargs)
        oriented = raw.t() if channels_first else raw
        reference = normalize_wav(oriented)
        loaded, loaded_rate = soundfile_backend.load(target, channels_first=channels_first)
        assert loaded_rate == sample_rate
        np.testing.assert_array_almost_equal(loaded.numpy(), reference.numpy())

    def assert_sphere(
        self,
        dtype,
        sample_rate,
        num_channels,
        channels_first=True,
        duration=1,
    ):
        """`soundfile_backend.load` can load SPHERE format correctly."""
        self._assert_written_with_soundfile(
            "reference.sph",
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            duration,
            subtype=dtype2subtype(dtype),
            format="NIST",
        )

    def assert_flac(
        self,
        dtype,
        sample_rate,
        num_channels,
        channels_first=True,
        duration=1,
    ):
        """`soundfile_backend.load` can load FLAC format correctly."""
        self._assert_written_with_soundfile(
            "reference.flac",
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            duration,
        )
class TestLoad(LoadTestBase):
    """Run the `LoadTestBase` wav assertion over several parameter combinations."""

    @parameterize(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [False, True],
    )
    def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first):
        """`soundfile_backend.load` can load wav format correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first)

    @parameterize(["int32"], [16000], [2], [False])
    def test_wav_large(self, dtype, sample_rate, num_channels, normalize):
        """`soundfile_backend.load` can load large wav file correctly."""
        seconds_per_hour = 60 * 60
        self.assert_wav(
            dtype, sample_rate, num_channels, normalize, duration=2 * seconds_per_hour
        )

    @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True])
    def test_multiple_channels(self, dtype, num_channels, channels_first):
        """`soundfile_backend.load` can load wav file with more than 2 channels."""
        self.assert_wav(dtype, 8000, num_channels, False, channels_first)

    # The sphere and flac load tests below are disabled in the original file.
    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    #@skipIfFormatNotSupported("NIST")
    #def test_sphere(self, dtype, sample_rate, num_channels, channels_first):
    #"""`soundfile_backend.load` can load sphere format correctly."""
    #self.assert_sphere(dtype, sample_rate, num_channels, channels_first)
    #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True])
    #@skipIfFormatNotSupported("FLAC")
    #def test_flac(self, dtype, sample_rate, num_channels, channels_first):
    #"""`soundfile_backend.load` can load flac format correctly."""
    #self.assert_flac(dtype, sample_rate, num_channels, channels_first)
class TestLoadFormat(TempDirMixin, unittest.TestCase):
    """Check that `soundfile_backend.load` reads files whose extension was removed."""

    original = None
    path = None

    def _make_file(self, format_):
        """Write ``test.<format_>``, read it back as the reference, then strip the extension.

        Returns the extension-less path and the reference data.
        """
        sample_rate = 8000
        named_path = self.get_temp_path(f"test.{format_}")
        samples = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(named_path, samples, sample_rate)
        reference = soundfile.read(named_path, dtype="float32")[0].T
        bare_path, _ = os.path.splitext(named_path)
        os.rename(named_path, bare_path)
        return bare_path, reference

    def _test_format(self, format_):
        """Load the extension-less file and compare it with the reference data."""
        bare_path, reference = self._make_file(format_)
        loaded, _ = soundfile_backend.load(bare_path)
        np.testing.assert_array_almost_equal(loaded, reference)

    @parameterized.expand([("WAV",), ("wav",)])
    def test_wav(self, format_):
        self._test_format(format_)

    @parameterized.expand([("FLAC",), ("flac",)])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, format_):
        self._test_format(format_)
class TestFileObject(TempDirMixin, unittest.TestCase):
    """Check `soundfile_backend.load` when it is given file-like objects."""

    def _test_fileobj(self, ext):
        """Loading audio via file-like object works"""
        sample_rate = 16000
        path = self.get_temp_path(f"test.{ext}")

        data = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(path, data, sample_rate)
        expected = soundfile.read(path, dtype="float32")[0].T

        with open(path, "rb") as fileobj:
            found, sr = soundfile_backend.load(fileobj)
        assert sr == sample_rate
        # Convert explicitly, matching _test_tarfile, instead of relying on
        # implicit tensor-to-array coercion inside numpy.
        np.testing.assert_array_almost_equal(found.numpy(), expected)

    def test_fileobj_wav(self):
        """Loading audio via file-like object works"""
        self._test_fileobj("wav")

    # Guarded like the other FLAC tests in this module so the test is skipped
    # rather than failing when FLAC support is unavailable.
    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Loading audio via file-like object works"""
        self._test_fileobj("flac")

    def _test_tarfile(self, ext):
        """Loading audio via file-like object works"""
        sample_rate = 16000
        audio_file = f"test.{ext}"
        audio_path = self.get_temp_path(audio_file)
        archive_path = self.get_temp_path("archive.tar.gz")

        data = get_wav_data("float32", num_channels=2).numpy().T
        soundfile.write(audio_path, data, sample_rate)
        expected = soundfile.read(audio_path, dtype="float32")[0].T

        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            found, sr = soundfile_backend.load(fileobj)

        assert sr == sample_rate
        np.testing.assert_array_almost_equal(found.numpy(), expected)

    def test_tarfile_wav(self):
        """Loading audio via file-like object works"""
        self._test_tarfile("wav")

    @skipIfFormatNotSupported("FLAC")
    def test_tarfile_flac(self):
        """Loading audio via file-like object works"""
        self._test_tarfile("flac")
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

@ -1,322 +0,0 @@
import io
import unittest
from unittest.mock import patch
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio.backends import soundfile_backend
from tests.unit.common_utils import (
get_wav_data,
load_wav,
nested_params,
normalize_wav,
save_wav,
TempDirMixin,
)
from common import fetch_wav_subtype, parameterize, skipIfFormatNotSupported
import paddle
import numpy as np
import soundfile
class MockedSaveTest(unittest.TestCase):
    """Check the arguments `soundfile_backend.save` passes to a mocked ``soundfile.write``."""

    @nested_params(
        ["float32", "int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [
            (None, None),
            ("PCM_U", None),
            ("PCM_U", 8),
            ("PCM_S", None),
            ("PCM_S", 16),
            ("PCM_S", 32),
            ("PCM_F", None),
            ("PCM_F", 32),
            ("PCM_F", 64),
            ("ULAW", None),
            ("ULAW", 8),
            ("ALAW", None),
            ("ALAW", 8),
        ],
    )
    @patch("soundfile.write")
    def test_wav(self, dtype, sample_rate, num_channels, channels_first, enc_params, mocked_write):
        """soundfile_backend.save passes correct subtype to soundfile.write when WAV"""
        target = "foo.wav"
        waveform = get_wav_data(
            dtype,
            num_channels,
            num_frames=3 * sample_rate,
            normalize=dtype == "float32",
            channels_first=channels_first,
        )
        waveform = paddle.transpose(waveform, [1, 0])

        encoding, bits_per_sample = enc_params
        soundfile_backend.save(
            target,
            waveform,
            sample_rate,
            channels_first=channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample,
        )

        # Index 1 of call_args holds the keyword arguments of the last call.
        write_kwargs = mocked_write.call_args[1]
        assert write_kwargs["file"] == target
        assert write_kwargs["samplerate"] == sample_rate
        assert write_kwargs["subtype"] == fetch_wav_subtype(dtype, encoding, bits_per_sample)
        assert write_kwargs["format"] is None
        if channels_first:
            expected_data = paddle.transpose(waveform, [1, 0])
        else:
            expected_data = waveform
        np.testing.assert_array_almost_equal(
            write_kwargs["data"].numpy(), expected_data.numpy()
        )

    @patch("soundfile.write")
    def assert_non_wav(
        self,
        fmt,
        dtype,
        sample_rate,
        num_channels,
        channels_first,
        mocked_write,
        encoding=None,
        bits_per_sample=None,
    ):
        """Save to ``foo.<fmt>`` and check the file, sample rate, format and data passed on.

        The format is expected to be ``"NIST"`` for sph/nist/nis and None otherwise.
        """
        target = f"foo.{fmt}"
        waveform = get_wav_data(
            dtype,
            num_channels,
            num_frames=3 * sample_rate,
            normalize=False,
            channels_first=channels_first,
        )
        waveform = paddle.transpose(waveform, [1, 0])
        if channels_first:
            expected_data = paddle.transpose(waveform, [1, 0])
        else:
            expected_data = waveform

        soundfile_backend.save(
            target,
            waveform,
            sample_rate,
            channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample,
        )

        # Index 1 of call_args holds the keyword arguments of the last call.
        write_kwargs = mocked_write.call_args[1]
        assert write_kwargs["file"] == target
        assert write_kwargs["samplerate"] == sample_rate
        expected_format = "NIST" if fmt in ["sph", "nist", "nis"] else None
        if expected_format is None:
            assert write_kwargs["format"] is None
        else:
            assert write_kwargs["format"] == expected_format
        np.testing.assert_array_almost_equal(
            write_kwargs["data"].numpy(), expected_data.numpy()
        )

    @nested_params(
        ["sph", "nist", "nis"],
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [
            ("PCM_S", 8),
            ("PCM_S", 16),
            ("PCM_S", 24),
            ("PCM_S", 32),
            ("ULAW", 8),
            ("ALAW", 8),
            ("ALAW", 16),
            ("ALAW", 24),
            ("ALAW", 32),
        ],
    )
    def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first, enc_params):
        """Run `assert_non_wav` for the sphere extensions with each encoding/bit-depth pair."""
        encoding, bits_per_sample = enc_params
        self.assert_non_wav(
            fmt,
            dtype,
            sample_rate,
            num_channels,
            channels_first,
            encoding=encoding,
            bits_per_sample=bits_per_sample,
        )

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
        [8, 16, 24],
    )
    def test_flac(self, dtype, sample_rate, num_channels, channels_first, bits_per_sample):
        """Run `assert_non_wav` for flac with each bit depth."""
        self.assert_non_wav(
            "flac", dtype, sample_rate, num_channels, channels_first, bits_per_sample=bits_per_sample
        )

    @parameterize(
        ["int32"],
        [8000, 16000],
        [1, 2],
        [False, True],
    )
    def test_ogg(self, dtype, sample_rate, num_channels, channels_first):
        """Run `assert_non_wav` for ogg with default encoding parameters."""
        self.assert_non_wav("ogg", dtype, sample_rate, num_channels, channels_first)
class SaveTestBase(TempDirMixin, unittest.TestCase):
    """Shared assertions for `soundfile_backend.save` tests that write real files."""

    def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
        """Save wav data, read it back with `load_wav`, and compare."""
        wav_path = self.get_temp_path("data.wav")
        original = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False)
        soundfile_backend.save(wav_path, original, sample_rate)
        reloaded, reloaded_rate = load_wav(wav_path, normalize=False)
        assert sample_rate == reloaded_rate
        np.testing.assert_array_almost_equal(reloaded.numpy(), original.numpy())

    def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels):
        """Save in a non-wav format and validate only the metadata from ``soundfile.info``.

        Format, channel count and sample rate are checked; the frame-count
        check is disabled (see the comment below).
        """
        frame_count = sample_rate * 3
        target = self.get_temp_path(f"data.{fmt}")
        original = get_wav_data(dtype, num_channels, num_frames=frame_count, normalize=False)
        soundfile_backend.save(target, original, sample_rate)
        file_info = soundfile.info(target)
        assert file_info.format == fmt.upper()
        # Disabled in the original with the note "this go wrong":
        #assert sinfo.frames == num_frames
        assert file_info.channels == num_channels
        assert file_info.samplerate == sample_rate

    def assert_flac(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` can save flac format."""
        self._assert_non_wav("flac", dtype, sample_rate, num_channels)

    def assert_sphere(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` can save sph format."""
        self._assert_non_wav("nist", dtype, sample_rate, num_channels)

    def assert_ogg(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` can save ogg format; only metadata is checked."""
        self._assert_non_wav("ogg", dtype, sample_rate, num_channels)
class TestSave(SaveTestBase):
    """Run the `SaveTestBase` assertions over several parameter combinations."""

    @parameterize(["float32", "int32"], [8000, 16000], [1, 2])
    def test_wav(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` can save wav format."""
        self.assert_wav(dtype, sample_rate, num_channels, num_frames=None)

    @parameterize(["float32", "int32"], [4, 8, 16, 32])
    def test_multiple_channels(self, dtype, num_channels):
        """`soundfile_backend.save` can save wav with more than 2 channels."""
        self.assert_wav(dtype, 8000, num_channels, num_frames=None)

    @parameterize(["int32"], [8000, 16000], [1, 2])
    @skipIfFormatNotSupported("NIST")
    def test_sphere(self, dtype, sample_rate, num_channels):
        """`soundfile_backend.save` can save sph format."""
        self.assert_sphere(dtype, sample_rate, num_channels)

    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("FLAC")
    def test_flac(self, sample_rate, num_channels):
        """`soundfile_backend.save` can save flac format."""
        self.assert_flac("float32", sample_rate, num_channels)

    @parameterize([8000, 16000], [1, 2])
    @skipIfFormatNotSupported("OGG")
    def test_ogg(self, sample_rate, num_channels):
        """`soundfile_backend.save` can save ogg/vorbis format."""
        self.assert_ogg("float32", sample_rate, num_channels)
class TestSaveParams(TempDirMixin, unittest.TestCase):
    """Test the correctness of optional parameters of `soundfile_backend.save`"""

    @parameterize([True, False])
    def test_channels_first(self, channels_first):
        """Save with the given layout and compare against what `load_wav` reads back."""
        wav_path = self.get_temp_path("data.wav")
        waveform = get_wav_data("int32", 2, channels_first=channels_first)
        soundfile_backend.save(wav_path, waveform, 8000, channels_first=channels_first)
        reloaded = load_wav(wav_path)[0]
        if channels_first:
            reference = waveform
        else:
            reference = waveform.transpose([1, 0])
        np.testing.assert_array_almost_equal(reloaded.numpy(), reference.numpy())
class TestFileObject(TempDirMixin, unittest.TestCase):
    """Check `soundfile_backend.save` when the destination is an in-memory buffer."""

    def _test_fileobj(self, ext):
        """Save to a ``BytesIO`` and compare with a file written directly by ``soundfile``."""
        sample_rate = 16000
        reference_path = self.get_temp_path(f"test.{ext}")
        if ext == "wav":
            subtype = "FLOAT"
        else:
            subtype = None
        waveform = get_wav_data("float32", num_channels=2)

        soundfile.write(reference_path, waveform.numpy().T, sample_rate, subtype=subtype)
        reference = soundfile.read(reference_path, dtype="float32")[0]

        buffer = io.BytesIO()
        soundfile_backend.save(buffer, waveform, sample_rate, format=ext)
        # Rewind so the read below starts at the beginning of the buffer.
        buffer.seek(0)
        decoded, decoded_rate = soundfile.read(buffer, dtype="float32")

        assert decoded_rate == sample_rate
        np.testing.assert_array_almost_equal(decoded, reference)

    def test_fileobj_wav(self):
        """Saving audio via file-like object works"""
        self._test_fileobj("wav")

    @skipIfFormatNotSupported("FLAC")
    def test_fileobj_flac(self):
        """Saving audio via file-like object works"""
        self._test_fileobj("flac")

    @skipIfFormatNotSupported("NIST")
    def test_fileobj_nist(self):
        """Saving audio via file-like object works"""
        self._test_fileobj("NIST")

    @skipIfFormatNotSupported("OGG")
    def test_fileobj_ogg(self):
        """Saving audio via file-like object works"""
        self._test_fileobj("OGG")
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

@ -1,73 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import filecmp
import os
import unittest
import numpy as np
import soundfile as sf
import paddlespeech.audio
from ..base import BackendTest
class TestIO(BackendTest):
    """Compare `paddlespeech.audio.load` / `save` with the ``soundfile`` equivalents."""

    def test_load_mono_channel(self):
        """Loading ``self.files[0]`` matches ``soundfile.read`` in dtype, rate and data."""
        sf_data, sf_sr = sf.read(self.files[0])
        pa_data, pa_sr = paddlespeech.audio.load(
            self.files[0], normal=False, dtype='float64')

        self.assertEqual(sf_data.dtype, pa_data.dtype)
        self.assertEqual(sf_sr, pa_sr)
        np.testing.assert_array_almost_equal(sf_data, pa_data)

    def test_load_multi_channels(self):
        """Loading ``self.files[1]`` with ``mono=False`` matches transposed ``soundfile.read`` output."""
        sf_data, sf_sr = sf.read(self.files[1])
        sf_data = sf_data.T  # Channel dim first
        pa_data, pa_sr = paddlespeech.audio.load(
            self.files[1], mono=False, normal=False, dtype='float64')

        self.assertEqual(sf_data.dtype, pa_data.dtype)
        self.assertEqual(sf_sr, pa_sr)
        np.testing.assert_array_almost_equal(sf_data, pa_data)

    def _assert_save_matches_soundfile(self, waveform, sr):
        """Write ``waveform`` with both libraries and require byte-identical files."""
        sf_tmp_file = 'sf_tmp.wav'
        pa_tmp_file = 'pa_tmp.wav'
        try:
            sf.write(sf_tmp_file, waveform, sr)
            paddlespeech.audio.save(waveform, sr, pa_tmp_file)
            # shallow=False forces a content comparison; the default may
            # report equality from matching os.stat() signatures alone.
            self.assertTrue(
                filecmp.cmp(sf_tmp_file, pa_tmp_file, shallow=False))
        finally:
            # Clean up even when a write or the assertion fails, and tolerate
            # a file that was never created.
            for tmp_file in (sf_tmp_file, pa_tmp_file):
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)

    def test_save_mono_channel(self):
        """Saving random int16 mono data produces the same file as ``soundfile.write``."""
        waveform, sr = np.random.randint(
            low=-32768, high=32768, size=(48000), dtype=np.int16), 16000
        self._assert_save_matches_soundfile(waveform, sr)

    def test_save_multi_channels(self):
        """Saving random int16 two-channel data produces the same file as ``soundfile.write``."""
        waveform, sr = np.random.randint(
            low=-32768, high=32768, size=(2, 48000), dtype=np.int16), 16000
        # Both writers receive the transposed array, as in the original test.
        self._assert_save_matches_soundfile(waveform.T, sr)
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

@ -1,289 +0,0 @@
import unittest
import itertools
import tarfile
from contextlib import contextmanager
import numpy as np
import paddle
import os
import io
from parameterized import parameterized
from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding
from paddlespeech.audio.backends import sox_io_backend
from tests.unit.common_utils import (
get_wav_data,
load_wav,
save_wav,
TempDirMixin,
sox_utils,
data_utils
)
# Code adapted from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/sox_io/info_test.py
class TestInfo(TempDirMixin, unittest.TestCase):
    """Check the metadata `sox_io_backend.info` reports for generated wav files."""

    def _check_wav_metadata(self, dtype, sample_rate, num_channels):
        """Save a one-second wav file and assert rate, frames, channels and bit depth.

        Returns the info object so callers can make further assertions.
        """
        duration = 1
        wav_path = self.get_temp_path("data.wav")
        samples = get_wav_data(
            dtype, num_channels, normalize=False, num_frames=duration * sample_rate
        )
        save_wav(wav_path, samples, sample_rate)
        metadata = sox_io_backend.info(wav_path)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_frames == sample_rate * duration
        assert metadata.num_channels == num_channels
        assert metadata.bits_per_sample == sox_utils.get_bit_depth(dtype)
        return metadata

    def _check_companded(self, sox_encoding, expected_encoding):
        """Generate an 8-bit mono 8 kHz file with the given encoding and assert its metadata."""
        duration = 1
        num_channels = 1
        sample_rate = 8000
        wav_path = self.get_temp_path("data.wav")
        sox_utils.gen_audio_file(
            wav_path,
            sample_rate=sample_rate,
            num_channels=num_channels,
            bit_depth=8,
            encoding=sox_encoding,
            duration=duration,
        )
        metadata = sox_io_backend.info(wav_path)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_frames == sample_rate * duration
        assert metadata.num_channels == num_channels
        assert metadata.bits_per_sample == 8
        assert metadata.encoding == expected_encoding

    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2])),
    )
    def test_wav(self, dtype, sample_rate, num_channels):
        """`sox_io_backend.info` can check wav file correctly"""
        metadata = self._check_wav_metadata(dtype, sample_rate, num_channels)
        assert metadata.encoding == get_encoding("wav", dtype)

    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [4, 8, 16, 32])),
    )
    def test_wav_multiple_channels(self, dtype, sample_rate, num_channels):
        """`sox_io_backend.info` can check wav file with channels more than 2 correctly"""
        self._check_wav_metadata(dtype, sample_rate, num_channels)

    def test_ulaw(self):
        """`sox_io_backend.info` can check ulaw file correctly"""
        self._check_companded("u-law", "ULAW")

    def test_alaw(self):
        """`sox_io_backend.info` can check alaw file correctly"""
        self._check_companded("a-law", "ALAW")
#class TestInfoOpus(unittest.TestCase):
#@parameterized.expand(
#list(
#itertools.product(
#["96k"],
#[1, 2],
#[0, 5, 10],
#)
#),
#)
#def test_opus(self, bitrate, num_channels, compression_level):
#"""`sox_io_backend.info` can check opus file correcty"""
#path = data_utils.get_asset_path("io", f"{bitrate}_{compression_level}_{num_channels}ch.opus")
#info = sox_io_backend.info(path)
#assert info.sample_rate == 48000
#assert info.num_frames == 32768
#assert info.num_channels == num_channels
#assert info.bits_per_sample == 0 # bit_per_sample is irrelevant for compressed formats
#assert info.encoding == "OPUS"
class FileObjTestBase(TempDirMixin):
    """Helpers that generate audio files through `sox_utils` for file-object tests."""

    def _gen_file(self, ext, dtype, sample_rate, num_channels, num_frames, *, comments=None):
        """Generate ``test.<ext>`` in the temp directory and return its path.

        The duration passed to `sox_utils.gen_audio_file` is
        ``num_frames / sample_rate``. A comment file is written and passed
        along only when ``comments`` is truthy.
        """
        audio_path = self.get_temp_path(f"test.{ext}")
        depth = sox_utils.get_bit_depth(dtype)
        seconds = num_frames / sample_rate
        if comments:
            comment_file = self._gen_comment_file(comments)
        else:
            comment_file = None
        sox_utils.gen_audio_file(
            audio_path,
            sample_rate,
            num_channels=num_channels,
            encoding=sox_utils.get_encoding(dtype),
            bit_depth=depth,
            duration=seconds,
            comment_file=comment_file,
        )
        return audio_path

    def _gen_comment_file(self, comments):
        """Write ``comments`` to ``comment.txt`` in the temp directory and return its path."""
        target = self.get_temp_path("comment.txt")
        with open(target, "w") as handle:
            handle.writelines(comments)
        return target
class Unseekable:
    """Wrapper around a file object that exposes only ``read``."""

    def __init__(self, fileobj):
        # The wrapped object; every read is delegated to it.
        self.fileobj = fileobj

    def read(self, n):
        """Return the result of reading ``n`` from the wrapped object."""
        chunk = self.fileobj.read(n)
        return chunk
class TestFileObject(FileObjTestBase, unittest.TestCase):
    """Check `sox_io_backend.info` when it is given file-like objects."""

    def _query_fileobj(self, ext, dtype, sample_rate, num_channels, num_frames, *, comments=None):
        """Generate a file and query it through an open binary file handle."""
        path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames, comments=comments)
        # A format hint is passed only for mp3.
        format_ = ext if ext in ["mp3"] else None
        with open(path, "rb") as fileobj:
            return sox_io_backend.info(fileobj, format_)

    def _query_bytesio(self, ext, dtype, sample_rate, num_channels, num_frames):
        """Generate a file and query it through an in-memory ``BytesIO`` copy."""
        path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames)
        format_ = ext if ext in ["mp3"] else None
        with open(path, "rb") as file_:
            fileobj = io.BytesIO(file_.read())
        return sox_io_backend.info(fileobj, format_)

    def _query_tarfile(self, ext, dtype, sample_rate, num_channels, num_frames):
        """Generate a file, add it to a tar archive, and query the extracted member."""
        audio_path = self._gen_file(ext, dtype, sample_rate, num_channels, num_frames)
        audio_file = os.path.basename(audio_path)
        archive_path = self.get_temp_path("archive.tar.gz")
        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(audio_path, arcname=audio_file)
        format_ = ext if ext in ["mp3"] else None
        with tarfile.TarFile(archive_path, "r") as tarobj:
            fileobj = tarobj.extractfile(audio_file)
            return sox_io_backend.info(fileobj, format_)

    @contextmanager
    def _set_buffer_size(self, buffer_size):
        """Temporarily change the buffer size, restoring the previous value on exit.

        NOTE(review): `get_buffer_size` / `set_buffer_size` are not imported
        in the visible part of this module and no visible test uses this
        helper; confirm where they should come from before relying on it.
        """
        # Read the original value before entering ``try`` so a failure here
        # is reported as-is instead of being masked by an UnboundLocalError
        # raised from the ``finally`` clause.
        original_buffer_size = get_buffer_size()
        try:
            set_buffer_size(buffer_size)
            yield
        finally:
            set_buffer_size(original_buffer_size)

    def _assert_sinfo(self, sinfo, ext, dtype, sample_rate, num_channels, num_frames, zero_frame_exts):
        """Assert every metadata field of ``sinfo``.

        ``num_frames`` is expected to be 0 when ``ext`` is listed in
        ``zero_frame_exts``.
        """
        bits_per_sample = get_bits_per_sample(ext, dtype)
        num_frames = 0 if ext in zero_frame_exts else num_frames
        assert sinfo.sample_rate == sample_rate
        assert sinfo.num_channels == num_channels
        assert sinfo.num_frames == num_frames
        assert sinfo.bits_per_sample == bits_per_sample
        assert sinfo.encoding == get_encoding(ext, dtype)

    @parameterized.expand(
        [
            ("wav", "float32"),
            ("wav", "int32"),
            ("wav", "int16"),
            ("wav", "uint8"),
        ]
    )
    def test_fileobj(self, ext, dtype):
        """Querying audio via file object works"""
        sample_rate = 16000
        num_frames = 3 * sample_rate
        num_channels = 2
        sinfo = self._query_fileobj(ext, dtype, sample_rate, num_channels, num_frames)
        self._assert_sinfo(
            sinfo, ext, dtype, sample_rate, num_channels, num_frames, ["mp3", "vorbis"]
        )

    @parameterized.expand(
        [
            ("wav", "float32"),
            ("wav", "int32"),
            ("wav", "int16"),
            ("wav", "uint8"),
        ]
    )
    def test_bytesio(self, ext, dtype):
        """Querying audio via ByteIO object works for small data"""
        sample_rate = 16000
        num_frames = 3 * sample_rate
        num_channels = 2
        sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels, num_frames)
        self._assert_sinfo(
            sinfo, ext, dtype, sample_rate, num_channels, num_frames, ["mp3", "vorbis"]
        )

    @parameterized.expand(
        [
            ("wav", "float32"),
            ("wav", "int32"),
            ("wav", "int16"),
            ("wav", "uint8"),
        ]
    )
    def test_bytesio_tiny(self, ext, dtype):
        """Querying audio via ByteIO object works for small data"""
        sample_rate = 8000
        num_frames = 4
        num_channels = 2
        sinfo = self._query_bytesio(ext, dtype, sample_rate, num_channels, num_frames)
        self._assert_sinfo(
            sinfo, ext, dtype, sample_rate, num_channels, num_frames, ["mp3", "vorbis"]
        )

    @parameterized.expand(
        [
            ("wav", "float32"),
            ("wav", "int32"),
            ("wav", "int16"),
            ("wav", "uint8"),
            ("flac", "float32"),
            ("vorbis", "float32"),
            ("amb", "int16"),
        ]
    )
    def test_tarfile(self, ext, dtype):
        """Querying compressed audio via file-like object works"""
        sample_rate = 16000
        # Kept as a float, as in the original test.
        num_frames = 3.0 * sample_rate
        num_channels = 2
        sinfo = self._query_tarfile(ext, dtype, sample_rate, num_channels, num_frames)
        self._assert_sinfo(
            sinfo, ext, dtype, sample_rate, num_channels, num_frames, ["vorbis"]
        )
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -1,47 +0,0 @@
import unittest
import itertools
from parameterized import parameterized
import numpy as np
from paddlespeech.audio._internal import module_utils as _mod_utils
from paddlespeech.audio.backends import sox_io_backend
from tests.unit.common_utils import (
get_wav_data,
load_wav,
save_wav,
)
# Code is adapted from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/sox_io/load_test.py
class TestLoad(unittest.TestCase):
    """Check that `sox_io_backend.load` reads wav files correctly."""

    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
        """`sox_io_backend.load` can load wav format correctly.

        Wav data loaded with the sox_io backend should match the data loaded
        with the `load_wav` test helper.

        Args:
            dtype: Sample dtype name passed to `get_wav_data`.
            sample_rate: Sample rate the file is written with.
            num_channels: Number of channels of the generated signal.
            normalize: Forwarded to `get_wav_data`, `load_wav` and `load`.
            duration: Multiplied by `sample_rate` to get the frame count.
        """
        # Imported locally so the block does not depend on file-level imports.
        import os
        import tempfile

        data = get_wav_data(dtype, num_channels, normalize=normalize, num_frames=duration * sample_rate)
        # Write into a throw-away directory instead of a hard-coded path that is
        # relative to the current working directory: the test then neither
        # requires a pre-existing `testdata/` folder nor leaves files behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "reference.wav")
            save_wav(path, data, sample_rate)
            expected = load_wav(path, normalize=normalize)[0]
            data, sr = sox_io_backend.load(path, normalize=normalize)
        assert sr == sample_rate
        np.testing.assert_array_almost_equal(data, expected, decimal=4)

    @parameterized.expand(
        list(
            itertools.product(
                ["float64", "float32", "int32"],
                [8000, 16000],
                [1, 2],
                [False, True],
            )
        ),
    )
    def test_wav(self, dtype, sample_rate, num_channels, normalize):
        """`sox_io_backend.load` can load wav format correctly."""
        self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=1)
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -1,175 +0,0 @@
import io
import os
import unittest
import numpy as np
import paddle
from parameterized import parameterized
from paddlespeech.audio.backends import sox_io_backend
from tests.unit.common_utils import (
get_wav_data,
load_wav,
save_wav,
nested_params,
TempDirMixin,
sox_utils
)
# Code is adapted from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/sox_io/save_test.py
def _get_sox_encoding(encoding):
encodings = {
"PCM_F": "floating-point",
"PCM_S": "signed-integer",
"PCM_U": "unsigned-integer",
"ULAW": "u-law",
"ALAW": "a-law",
}
return encodings.get(encoding)
class TestSaveBase(TempDirMixin):
    """Mixin providing the `save` vs. `sox` command consistency check."""

    def _save_with_backend(self, test_mode, tgt_path, waveform, sample_rate, format, save_options):
        """Write `waveform` to `tgt_path` with `sox_io_backend.save` using `test_mode`.

        Raises:
            ValueError: if `test_mode` is not "path", "fileobj" or "bytesio".
        """
        if test_mode == "path":
            # The format is not passed explicitly in this mode.
            sox_io_backend.save(tgt_path, waveform, sample_rate, **save_options)
            return
        if test_mode == "fileobj":
            with open(tgt_path, "bw") as sink:
                sox_io_backend.save(sink, waveform, sample_rate, format=format, **save_options)
            return
        if test_mode == "bytesio":
            buffer = io.BytesIO()
            sox_io_backend.save(buffer, waveform, sample_rate, format=format, **save_options)
            # Dump the in-memory result to disk so that sox can convert it.
            buffer.seek(0)
            with open(tgt_path, "bw") as sink:
                sink.write(buffer.read())
            return
        raise ValueError(f"Unexpected test mode: {test_mode}")

    def assert_save_consistency(
        self,
        format: str,
        *,
        compression: float = None,
        encoding: str = None,
        bits_per_sample: int = None,
        sample_rate: float = 8000,
        num_channels: int = 2,
        num_frames: float = 3 * 8000,
        src_dtype: str = "int32",
        test_mode: str = "path",
    ):
        """`save` function produces file that is comparable with `sox` command

        To compare the file produced by the `save` function against the file
        produced by the equivalent `sox` command, both files have to be loaded.
        But there are many formats that cannot be opened with common Python
        modules (like SciPy). So the `sox` command is used to prepare the
        original data and to convert the saved files into a format that SciPy
        can read (PCM wav).
        The following diagram illustrates this process. The difference is
        2.1. and 3.1.

        This assumes that
        - loading data with SciPy preserves the data well.
        - converting the resulting files into WAV format with `sox` preserves
          the data well.

                              x
                              | 1. Generate source wav file with SciPy
                              |
                              v
              -------------- wav ----------------
             |                                   |
             | 2.1. load with scipy              | 3.1. Convert to the target
             |   then save it into the target    |      format depth with sox
             |   format with paddleaudio         |
             v                                   v
        target format                       target format
             |                                   |
             | 2.2. Convert to wav with sox      | 3.2. Convert to wav with sox
             |                                   |
             v                                   v
            wav                                 wav
             |                                   |
             | 2.3. load with scipy              | 3.3. load with scipy
             |                                   |
             v                                   v
           tensor -------> compare <--------- tensor

        Args:
            format: Target format; used as the extension of the generated files
                and passed to `save` in the "fileobj" and "bytesio" modes.
            compression, encoding, bits_per_sample: Forwarded to `save` and to
                the reference `sox` conversion.
            sample_rate, num_channels, num_frames, src_dtype: Describe the
                generated source signal.
            test_mode: One of "path", "fileobj" or "bytesio".

        Raises:
            ValueError: if `test_mode` is not one of the supported modes.
        """
        # Both branches are finally converted to this wav flavour for comparison.
        comparison_format = dict(encoding="floating-point", bit_depth=32)

        src_path = self.get_temp_path("1.source.wav")
        tgt_path = self.get_temp_path(f"2.1.paddleaudio.{format}")
        tst_path = self.get_temp_path("2.2.result.wav")
        sox_path = self.get_temp_path(f"3.1.sox.{format}")
        ref_path = self.get_temp_path("3.2.ref.wav")

        # 1. Generate original wav
        generated = get_wav_data(src_dtype, num_channels, normalize=False, num_frames=num_frames)
        save_wav(src_path, generated, sample_rate)

        # 2.1. Convert the original wav to target format with paddleaudio
        waveform = load_wav(src_path, normalize=False)[0]
        save_options = dict(compression=compression, encoding=encoding, bits_per_sample=bits_per_sample)
        self._save_with_backend(test_mode, tgt_path, waveform, sample_rate, format, save_options)
        # 2.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(tgt_path, tst_path, **comparison_format)
        # 2.3. Load with SciPy
        found = load_wav(tst_path, normalize=False)[0]

        # 3.1. Convert the original wav to target format with sox
        sox_utils.convert_audio_file(
            src_path,
            sox_path,
            compression=compression,
            encoding=_get_sox_encoding(encoding),
            bit_depth=bits_per_sample,
        )
        # 3.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(sox_path, ref_path, **comparison_format)
        # 3.3. Load with SciPy
        expected = load_wav(ref_path, normalize=False)[0]

        np.testing.assert_array_almost_equal(found, expected)
class TestSave(TestSaveBase, unittest.TestCase):
    """Consistency tests for saving wav files with `sox_io_backend.save`."""

    @nested_params(
        ["path"],
        [
            ("PCM_U", 8),
            ("PCM_S", 16),
            ("PCM_S", 32),
            ("PCM_F", 32),
            ("PCM_F", 64),
            ("ULAW", 8),
            ("ALAW", 8),
        ],
    )
    def test_save_wav(self, test_mode, enc_params):
        """Saving wav with each (encoding, bits per sample) pair matches sox."""
        enc_name, depth = enc_params
        self.assert_save_consistency(
            "wav",
            encoding=enc_name,
            bits_per_sample=depth,
            test_mode=test_mode,
        )

    @nested_params(
        ["path"],
        [
            ("float32",),
            ("int32",),
        ],
    )
    def test_save_wav_dtype(self, test_mode, params):
        """Saving wav from each source dtype matches sox."""
        [source_dtype] = params
        self.assert_save_consistency("wav", src_dtype=source_dtype, test_mode=test_mode)
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -1,183 +0,0 @@
import io
import itertools
import unittest
from parameterized import parameterized
from paddlespeech.audio.backends import sox_io_backend
from tests.unit.common_utils import (
get_wav_data,
TempDirMixin,
name_func
)
class SmokeTest(TempDirMixin, unittest.TestCase):
    """Run smoke test on various audio format

    The purpose of this test suite is to verify that sox_io_backend functionalities do not exhibit
    abnormal behaviors.

    This test suite should be able to run without any additional tools (such as sox command),
    however without such tools, the correctness of each function cannot be verified.
    """

    def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype="float32"):
        """Save one second of generated audio to a temp file, then run `info` and `load` on it."""
        seconds = 1
        frame_count = sample_rate * seconds
        target = self.get_temp_path(f"test.{ext}")
        waveform = get_wav_data(dtype, num_channels, normalize=False, num_frames=frame_count)

        # 1. run save
        sox_io_backend.save(target, waveform, sample_rate, compression=compression)

        # 2. run info
        meta = sox_io_backend.info(target)
        assert meta.sample_rate == sample_rate
        assert meta.num_channels == num_channels

        # 3. run load
        restored, restored_rate = sox_io_backend.load(target, normalize=False)
        assert restored_rate == sample_rate
        assert restored.shape[0] == num_channels

    # NOTE: only "float32" and "int32" are exercised here; "int16" and "uint8"
    # are currently left out of the dtype list.
    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2])),
        name_func=name_func,
    )
    def test_wav(self, dtype, sample_rate, num_channels):
        """Run smoke test on wav format"""
        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)

    # NOTE: there are no active smoke tests for mp3 (bit rates -4.2 ... 320) or
    # vorbis (quality levels -1 ... 10) in this class -- presumably because
    # these formats are not supported yet (TODO confirm).

    @parameterized.expand(
        list(itertools.product([8000, 16000], [1, 2], range(9))),
        name_func=name_func,
    )
    def test_flac(self, sample_rate, num_channels, compression_level):
        """Run smoke test on flac format"""
        self.run_smoke_test("flac", sample_rate, num_channels, compression=compression_level)
class SmokeTestFileObj(unittest.TestCase):
    """Run smoke test on various audio format

    The purpose of this test suite is to verify that sox_io_backend functionalities do not exhibit
    abnormal behaviors.

    This test suite should be able to run without any additional tools (such as sox command),
    however without such tools, the correctness of each function cannot be verified.
    """

    def run_smoke_test(self, ext, sample_rate, num_channels, *, compression=None, dtype="float32"):
        """Save one second of generated audio into a BytesIO, then run `info` and `load` on it."""
        seconds = 1
        frame_count = sample_rate * seconds
        waveform = get_wav_data(dtype, num_channels, normalize=False, num_frames=frame_count)

        buffer = io.BytesIO()
        # 1. run save
        sox_io_backend.save(buffer, waveform, sample_rate, compression=compression, format=ext)

        # 2. run info (rewind first: the buffer position is at the end after saving)
        buffer.seek(0)
        meta = sox_io_backend.info(buffer, format=ext)
        assert meta.sample_rate == sample_rate
        assert meta.num_channels == num_channels

        # 3. run load (rewind again before re-reading)
        buffer.seek(0)
        restored, restored_rate = sox_io_backend.load(buffer, normalize=False, format=ext)
        assert restored_rate == sample_rate
        assert restored.shape[0] == num_channels

    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2])),
        name_func=name_func,
    )
    def test_wav(self, dtype, sample_rate, num_channels):
        """Run smoke test on wav format"""
        self.run_smoke_test("wav", sample_rate, num_channels, dtype=dtype)

    # NOTE: the mp3 (bit rates -4.2 ... 320) and vorbis (quality levels
    # -1 ... 10) smoke tests are disabled: these formats are not supported yet.

    @parameterized.expand(
        list(itertools.product([8000, 16000], [1, 2], range(9))),
        name_func=name_func,
    )
    def test_flac(self, sample_rate, num_channels, compression_level):
        """Run smoke test on flac format"""
        self.run_smoke_test("flac", sample_rate, num_channels, compression=compression_level)
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -1,347 +0,0 @@
#code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/sox_effect/sox_effect_test.py
import io
import itertools
import tarfile
import unittest
from pathlib import Path
import numpy as np
from parameterized import parameterized
from paddlespeech.audio import sox_effects
from paddlespeech.audio._internal import module_utils as _mod_utils
from tests.unit.common_utils import (
get_sinusoid,
get_wav_data,
load_wav,
save_wav,
sox_utils,
TempDirMixin,
name_func,
load_effects_params
)
if _mod_utils.is_module_available("requests"):
import requests
class TestSoxEffects(unittest.TestCase):
    """Tests for initialising the sox effects module."""

    def test_init(self):
        """Calling init_sox_effects multiple times does not crash"""
        for _attempt in (1, 2, 3):
            sox_effects.init_sox_effects()
class TestSoxEffectsTensor(TempDirMixin, unittest.TestCase):
    """Test suite for `apply_effects_tensor` function"""

    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2, 4, 8], [True, False])),
    )
    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
        """`apply_effects_tensor` without effects should return identical data as input"""
        pristine = get_wav_data(dtype, num_channels, channels_first=channels_first)
        fed = pristine.clone()
        result, result_rate = sox_effects.apply_effects_tensor(fed, sample_rate, [], channels_first)

        assert result_rate == sample_rate

        # SoxEffect should not alter the input Tensor object
        np.testing.assert_array_almost_equal(pristine.numpy(), fed.numpy())

        # SoxEffect should not return the same Tensor object
        assert fed is not result

        # Returned Tensor should equal to the input Tensor
        np.testing.assert_array_almost_equal(fed.numpy(), result.numpy())

    @parameterized.expand(
        load_effects_params("sox_effect_test_args.jsonl"),
        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
    )
    def test_apply_effects(self, args):
        """`apply_effects_tensor` should return identical data as sox command"""
        effects = args["effects"]
        channel_count = args.get("num_channels", 2)
        rate_in = args.get("input_sample_rate", 8000)
        rate_out = args.get("output_sample_rate")

        input_path = self.get_temp_path("input.wav")
        reference_path = self.get_temp_path("reference.wav")

        # Feed the same sinusoid to the sox command (via a wav file) and to the tensor API.
        signal = get_sinusoid(frequency=800, sample_rate=rate_in, n_channels=channel_count, dtype="float32")
        save_wav(input_path, signal, rate_in)
        sox_utils.run_sox_effect(input_path, reference_path, effects, output_sample_rate=rate_out)

        expected, expected_rate = load_wav(reference_path)
        result, result_rate = sox_effects.apply_effects_tensor(signal, rate_in, effects)

        assert result_rate == expected_rate
        np.testing.assert_array_almost_equal(expected.numpy(), result.numpy())
class TestSoxEffectsFile(TempDirMixin, unittest.TestCase):
    """Test suite for `apply_effects_file` function"""

    def _assert_matches_sox(self, effects, num_channels, input_sr, output_sr, path_type=None):
        """Apply `effects` with both the sox command and `apply_effects_file` and compare.

        `path_type`, when given, is called on the input path before it is handed
        to `apply_effects_file` (e.g. `Path`); otherwise the path is passed as is.
        """
        dtype = "int32"
        channels_first = True

        input_path = self.get_temp_path("input.wav")
        reference_path = self.get_temp_path("reference.wav")
        signal = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, signal, input_sr, channels_first=channels_first)
        sox_utils.run_sox_effect(input_path, reference_path, effects, output_sample_rate=output_sr)

        expected, expected_rate = load_wav(reference_path)
        source = input_path if path_type is None else path_type(input_path)
        result, result_rate = sox_effects.apply_effects_file(
            source, effects, normalize=False, channels_first=channels_first
        )

        assert result_rate == expected_rate
        np.testing.assert_array_almost_equal(expected.numpy(), result.numpy())

    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2, 4, 8], [False, True])),
    )
    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
        """`apply_effects_file` without effects should return identical data as input"""
        wav_path = self.get_temp_path("input.wav")
        written = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(wav_path, written, sample_rate, channels_first=channels_first)

        result, result_rate = sox_effects.apply_effects_file(
            wav_path, [], normalize=False, channels_first=channels_first
        )

        assert result_rate == sample_rate
        np.testing.assert_array_almost_equal(written.numpy(), result.numpy())

    @parameterized.expand(
        load_effects_params("sox_effect_test_args.jsonl"),
    )
    def test_apply_effects_str(self, args):
        """`apply_effects_file` should return identical data as sox command"""
        self._assert_matches_sox(
            args["effects"],
            args.get("num_channels", 2),
            args.get("input_sample_rate", 8000),
            args.get("output_sample_rate"),
        )

    def test_apply_effects_path(self):
        """`apply_effects_file` should return identical data as sox command when file path is given as a Path Object"""
        self._assert_matches_sox([["hilbert"]], 2, 8000, 8000, path_type=Path)
class TestFileFormats(TempDirMixin, unittest.TestCase):
    """`apply_effects_file` gives the same result as sox on various file formats"""

    @parameterized.expand(
        list(itertools.product(["float32", "int32"], [8000, 16000], [1, 2])),
    )
    def test_wav(self, dtype, sample_rate, num_channels):
        """`apply_effects_file` works on various wav format"""
        channels_first = True
        band_effect = [["band", "300", "10"]]

        input_path = self.get_temp_path("input.wav")
        reference_path = self.get_temp_path("reference.wav")
        signal = get_wav_data(dtype, num_channels, channels_first=channels_first)
        save_wav(input_path, signal, sample_rate, channels_first=channels_first)
        sox_utils.run_sox_effect(input_path, reference_path, band_effect)

        expected, expected_rate = load_wav(reference_path)
        result, result_rate = sox_effects.apply_effects_file(
            input_path, band_effect, normalize=False, channels_first=channels_first
        )

        assert result_rate == expected_rate
        np.testing.assert_array_almost_equal(result.numpy(), expected.numpy())

    # NOTE: the flac and vorbis counterparts of `test_wav` are not included
    # because those formats are not supported now. See the upstream torchaudio
    # `sox_effect_test.py` for their implementation.
#@skipIfNoExec("sox")
#@skipIfNoSox
class TestFileObject(TempDirMixin, unittest.TestCase):
    """`apply_effects_file` accepts file-like objects as input."""

    def _prepare_reference(self, input_name, sample_rate, effects, channels_first):
        """Write the 2-channel int32 input wav and compute the sox reference output.

        Returns:
            Tuple ``(input_path, expected, expected_sr)`` where `expected` and
            `expected_sr` are what `load_wav` returns for the file produced by
            the sox command.
        """
        input_path = self.get_temp_path(input_name)
        reference_path = self.get_temp_path("reference.wav")
        data = get_wav_data("int32", 2, channels_first=channels_first)
        save_wav(input_path, data, sample_rate, channels_first=channels_first)
        sox_utils.run_sox_effect(input_path, reference_path, effects, output_bitdepth=32)
        expected, expected_sr = load_wav(reference_path)
        return input_path, expected, expected_sr

    def _assert_same_as_reference(self, found, sr, expected, expected_sr, channels_first):
        """Store the result as ``result.wav`` and compare it with the reference."""
        save_wav(self.get_temp_path("result.wav"), found, sr, channels_first=channels_first)
        assert sr == expected_sr
        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())

    @parameterized.expand(
        [
            ("wav", None),
        ]
    )
    def test_fileobj(self, ext, compression):
        """Applying effects via file object works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        input_path, expected, expected_sr = self._prepare_reference(
            f"input.{ext}", sample_rate, effects, channels_first
        )

        with open(input_path, "rb") as fileobj:
            found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first)

        self._assert_same_as_reference(found, sr, expected, expected_sr, channels_first)

    @parameterized.expand(
        [
            ("wav", None),
        ]
    )
    def test_bytesio(self, ext, compression):
        """Applying effects via BytesIO object works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        input_path, expected, expected_sr = self._prepare_reference(
            f"input.{ext}", sample_rate, effects, channels_first
        )

        with open(input_path, "rb") as file_:
            fileobj = io.BytesIO(file_.read())
        found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first)

        # The debug print() calls that used to dump both tensors were removed:
        # assert_array_almost_equal already reports the mismatch on failure.
        self._assert_same_as_reference(found, sr, expected, expected_sr, channels_first)

    @parameterized.expand(
        [
            ("wav", None),
        ]
    )
    def test_tarfile(self, ext, compression):
        """Applying effects to audio read from a tar archive member works"""
        sample_rate = 16000
        channels_first = True
        effects = [["band", "300", "10"]]
        audio_file = f"input.{ext}"
        archive_path = self.get_temp_path("archive.tar.gz")
        input_path, expected, expected_sr = self._prepare_reference(
            audio_file, sample_rate, effects, channels_first
        )

        with tarfile.TarFile(archive_path, "w") as tarobj:
            tarobj.add(input_path, arcname=audio_file)
        with tarfile.TarFile(archive_path, "r") as tarobj:
            # The member is consumed while the archive is still open.
            fileobj = tarobj.extractfile(audio_file)
            found, sr = sox_effects.apply_effects_file(fileobj, effects, channels_first=channels_first)

        self._assert_same_as_reference(found, sr, expected, expected_sr, channels_first)
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -17,8 +17,7 @@ import urllib.request
import numpy as np
import paddle
from paddlespeech.audio.soundfile_backend import soundfile_load as load
from paddleaudio.backends import soundfile_load as load
wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav'

@ -15,9 +15,9 @@ import unittest
import numpy as np
import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
from paddlespeech.audio.functional.window import get_window
from paddlespeech.s2t.transform.spectrogram import IStft
from paddlespeech.s2t.transform.spectrogram import Stft

@ -1,81 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
import torch
import torchaudio
import paddlespeech.audio
from .base import FeatTest
class TestKaldi(FeatTest):
    """Compare paddlespeech.audio windows and Kaldi-style features with torch / torchaudio."""

    def initParmas(self):
        """Set the window length and the dtype name used by the tests.

        NOTE(review): the (misspelled) method name is kept unchanged; it is
        presumably the hook that the `FeatTest` base class calls -- confirm.
        """
        self.window_size = 1024
        self.dtype = 'float32'

    def test_window(self):
        """hann, hamming and povey windows agree between torch and paddlespeech.audio."""
        # Resolve the dtype objects by attribute lookup rather than eval():
        # same result for a dtype name, without evaluating a built string.
        torch_dtype = getattr(torch, self.dtype)
        paddle_dtype = getattr(paddle, self.dtype)

        t_hann_window = torch.hann_window(
            self.window_size, periodic=False, dtype=torch_dtype)
        t_hamm_window = torch.hamming_window(
            self.window_size,
            periodic=False,
            alpha=0.54,
            beta=0.46,
            dtype=torch_dtype)
        # The povey window is built as a hann window raised to the power 0.85.
        t_povey_window = torch.hann_window(
            self.window_size, periodic=False, dtype=torch_dtype).pow(0.85)

        p_hann_window = paddlespeech.audio.functional.window.get_window(
            'hann', self.window_size, fftbins=False, dtype=paddle_dtype)
        p_hamm_window = paddlespeech.audio.functional.window.get_window(
            'hamming', self.window_size, fftbins=False, dtype=paddle_dtype)
        p_povey_window = paddlespeech.audio.functional.window.get_window(
            'hann', self.window_size, fftbins=False,
            dtype=paddle_dtype).pow(0.85)

        np.testing.assert_array_almost_equal(t_hann_window, p_hann_window)
        np.testing.assert_array_almost_equal(t_hamm_window, p_hamm_window)
        np.testing.assert_array_almost_equal(t_povey_window, p_povey_window)

    def test_fbank(self):
        """Kaldi-compliant fbank features agree with torchaudio to 4 decimals."""
        ta_features = torchaudio.compliance.kaldi.fbank(
            torch.from_numpy(self.waveform.astype(self.dtype)))
        pa_features = paddlespeech.audio.compliance.kaldi.fbank(
            paddle.to_tensor(self.waveform.astype(self.dtype)))
        np.testing.assert_array_almost_equal(
            ta_features, pa_features, decimal=4)

    def test_mfcc(self):
        """Kaldi-compliant mfcc features agree with torchaudio to 4 decimals."""
        ta_features = torchaudio.compliance.kaldi.mfcc(
            torch.from_numpy(self.waveform.astype(self.dtype)))
        pa_features = paddlespeech.audio.compliance.kaldi.mfcc(
            paddle.to_tensor(self.waveform.astype(self.dtype)))
        np.testing.assert_array_almost_equal(
            ta_features, pa_features, decimal=4)
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -1,58 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
from paddlespeech.audio.kaldi import fbank as fbank
from paddlespeech.audio.kaldi import pitch as pitch
from kaldiio import ReadHelper
# The ground-truth features were computed with the Kaldi commands below:
#   compute-fbank-feats --dither=0 scp:$wav_scp ark,t:fbank_feat.ark
#   compute-kaldi-pitch-feats --sample-frequency=16000 scp:$wav_scp ark,t:pitch_feat.ark
class TestKaldiFbank(unittest.TestCase):
    """Compare the `fbank` and `pitch` bindings with features stored in Kaldi ark files."""

    @staticmethod
    def _read_ark(rspecifier):
        """Read every (key, matrix) pair from `rspecifier` into a dict."""
        with ReadHelper(rspecifier) as reader:
            return {key: feat for key, feat in reader}

    def test_fbank(self):
        """`fbank` output matches the stored fbank features to 4 decimals."""
        groundtruth = self._read_ark('ark:testdata/fbank_feat.ark')

        with ReadHelper('ark:testdata/wav.ark') as reader:
            for key, wav in reader:
                np.testing.assert_array_almost_equal(
                    fbank(wav), groundtruth[key], decimal=4)

    def test_pitch(self):
        """`pitch` output matches the stored pitch features to 4 decimals."""
        groundtruth = self._read_ark('ark:testdata/pitch_feat.ark')

        with ReadHelper('ark:testdata/wav.ark') as reader:
            for key, wav in reader:
                np.testing.assert_array_almost_equal(
                    pitch(wav), groundtruth[key], decimal=4)
# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

@ -1,281 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import librosa
import numpy as np
import paddle
import paddlespeech.audio
from .base import FeatTest
from paddlespeech.audio.functional.window import get_window
class TestLibrosa(FeatTest):
def initParmas(self):
    """Set the STFT / mel / MFCC parameters used by the tests of this class."""
    # STFT framing
    self.n_fft, self.hop_length = 512, 128
    # Mel filter bank and MFCC sizes
    self.n_mels, self.n_mfcc = 40, 20
    self.fmin = 0.0
    # Window type and padding mode for the STFT
    self.window_str, self.pad_mode = 'hann', 'reflect'
    self.top_db = 80.0
def test_stft(self):
    """`paddle.signal.stft` agrees with `librosa.core.stft` to 5 decimals."""
    if len(self.waveform.shape) == 2:  # (C, T)
        # librosa.core.stft is given a 1D signal, so drop the leading axis.
        self.waveform = self.waveform.squeeze(0)

    # Keyword arguments shared by both STFT implementations.
    framing = dict(
        n_fft=self.n_fft,
        hop_length=self.hop_length,
        win_length=None,
        center=True,
        pad_mode=self.pad_mode, )

    expected = librosa.core.stft(
        y=self.waveform, window=self.window_str, dtype=None, **framing)

    batched = paddle.to_tensor(self.waveform).unsqueeze(0)
    win = get_window(self.window_str, self.n_fft, dtype=batched.dtype)
    actual = paddle.signal.stft(
        x=batched, window=win, normalized=False, onesided=True,
        **framing).squeeze(0)

    np.testing.assert_array_almost_equal(expected, actual, decimal=5)
def test_istft(self):
    """`paddle.signal.istft` agrees with `librosa.core.istft` to 5 decimals."""
    if len(self.waveform.shape) == 2:  # (C, T)
        # librosa.core.stft is given a 1D signal, so drop the leading axis.
        self.waveform = self.waveform.squeeze(0)

    # Keyword arguments shared by every (i)STFT call below.
    hop_and_window = dict(
        hop_length=self.hop_length, win_length=None, center=True)

    # Get stft result from librosa.
    spectrum = librosa.core.stft(
        y=self.waveform,
        n_fft=self.n_fft,
        window=self.window_str,
        pad_mode=self.pad_mode,
        **hop_and_window)

    expected = librosa.core.istft(
        stft_matrix=spectrum,
        window=self.window_str,
        dtype=None,
        length=None,
        **hop_and_window)

    batched = paddle.to_tensor(spectrum).unsqueeze(0)
    # The window takes the dtype of the waveform tensor.
    win = get_window(
        self.window_str,
        self.n_fft,
        dtype=paddle.to_tensor(self.waveform).dtype)
    actual = paddle.signal.istft(
        x=batched,
        n_fft=self.n_fft,
        window=win,
        normalized=False,
        onesided=True,
        length=None,
        return_complex=False,
        **hop_and_window).squeeze(0)

    np.testing.assert_array_almost_equal(expected, actual, decimal=5)
def test_mel(self):
    """Both paddlespeech.audio mel filter banks agree with `librosa.filters.mel`."""
    # Keyword arguments common to the three filter-bank builders.
    bank_args = dict(
        sr=self.sr,
        n_fft=self.n_fft,
        n_mels=self.n_mels,
        htk=False,
        norm='slaney', )

    expected = librosa.filters.mel(
        fmin=self.fmin, fmax=None, dtype=self.waveform.dtype, **bank_args)

    from_compliance = paddlespeech.audio.compliance.librosa.compute_fbank_matrix(
        fmin=self.fmin, fmax=None, dtype=self.waveform.dtype, **bank_args)

    # The functional variant uses f_min / f_max and takes the dtype of the
    # waveform converted to a paddle tensor.
    as_tensor = paddle.to_tensor(self.waveform)
    from_functional = paddlespeech.audio.functional.compute_fbank_matrix(
        f_min=self.fmin, f_max=None, dtype=as_tensor.dtype, **bank_args)

    np.testing.assert_array_almost_equal(expected, from_compliance)
    np.testing.assert_array_almost_equal(expected, from_functional)
def test_melspect(self):
    # Mel spectrogram parity: librosa is the reference for both
    # paddlespeech.audio.compliance.librosa.melspectrogram (to_db=False) and
    # the paddlespeech.audio.features.MelSpectrogram layer (5 decimals).
    if len(self.waveform.shape) == 2:  # (C, T)
        # librosa.feature.melspectrogram is given a 1D signal.
        self.waveform = self.waveform.squeeze(0)

    # Keyword arguments that carry the same name in every implementation.
    common = dict(sr=self.sr, hop_length=self.hop_length, n_mels=self.n_mels)

    # librosa:
    expected = librosa.feature.melspectrogram(
        y=self.waveform, n_fft=self.n_fft, fmin=self.fmin, **common)

    # paddlespeech.audio.compliance.librosa:
    from_compliance = paddlespeech.audio.compliance.librosa.melspectrogram(
        x=self.waveform,
        window_size=self.n_fft,
        fmin=self.fmin,
        to_db=False,
        **common)

    # paddlespeech.audio.features.layer
    batch = paddle.to_tensor(self.waveform, dtype=paddle.float64)
    batch = batch.unsqueeze(0)  # Add batch dim.
    mel_layer = paddlespeech.audio.features.MelSpectrogram(
        n_fft=self.n_fft, f_min=self.fmin, dtype=batch.dtype, **common)
    from_layer = mel_layer(batch).squeeze(0).numpy()

    for candidate in (from_compliance, from_layer):
        np.testing.assert_array_almost_equal(expected, candidate, decimal=5)
def test_log_melspect(self):
    # Log-mel parity: librosa's mel spectrogram passed through power_to_db
    # (top_db=None) is the reference for the compliance melspectrogram with
    # its default arguments and for the LogMelSpectrogram layer.
    if len(self.waveform.shape) == 2:  # (C, T)
        # librosa.feature.melspectrogram is given a 1D signal.
        self.waveform = self.waveform.squeeze(0)

    # Keyword arguments that carry the same name in every implementation.
    common = dict(sr=self.sr, hop_length=self.hop_length, n_mels=self.n_mels)

    # librosa:
    mel_power = librosa.feature.melspectrogram(
        y=self.waveform, n_fft=self.n_fft, fmin=self.fmin, **common)
    expected = librosa.power_to_db(mel_power, top_db=None)

    # paddlespeech.audio.compliance.librosa:
    from_compliance = paddlespeech.audio.compliance.librosa.melspectrogram(
        x=self.waveform, window_size=self.n_fft, fmin=self.fmin, **common)

    # paddlespeech.audio.features.layer
    batch = paddle.to_tensor(self.waveform, dtype=paddle.float64)
    batch = batch.unsqueeze(0)  # Add batch dim.
    log_mel_layer = paddlespeech.audio.features.LogMelSpectrogram(
        n_fft=self.n_fft, f_min=self.fmin, dtype=batch.dtype, **common)
    from_layer = log_mel_layer(batch).squeeze(0).numpy()

    # The layer output is compared at 4 decimals, the compliance output at 5.
    for candidate, places in ((from_compliance, 5), (from_layer, 4)):
        np.testing.assert_array_almost_equal(
            expected, candidate, decimal=places)
def test_mfcc(self):
    # MFCC parity: librosa.feature.mfcc is the reference for both
    # paddlespeech.audio.compliance.librosa.mfcc and the
    # paddlespeech.audio.features.MFCC layer (4 decimals each).
    if len(self.waveform.shape) == 2:  # (C, T)
        # librosa.feature.mfcc is given a 1D signal.
        self.waveform = self.waveform.squeeze(0)

    # Keyword arguments that carry the same name in every implementation.
    common = dict(
        sr=self.sr,
        n_mfcc=self.n_mfcc,
        hop_length=self.hop_length,
        n_mels=self.n_mels)
    # DCT settings passed explicitly to the two numpy-side calls.
    dct = dict(dct_type=2, norm='ortho', lifter=0)

    # librosa:
    expected = librosa.feature.mfcc(
        y=self.waveform,
        S=None,
        n_fft=self.n_fft,
        fmin=self.fmin,
        **dct,
        **common)

    # paddlespeech.audio.compliance.librosa:
    from_compliance = paddlespeech.audio.compliance.librosa.mfcc(
        x=self.waveform,
        window_size=self.n_fft,
        fmin=self.fmin,
        top_db=self.top_db,
        **dct,
        **common)

    # paddlespeech.audio.features.layer
    batch = paddle.to_tensor(self.waveform, dtype=paddle.float64)
    batch = batch.unsqueeze(0)  # Add batch dim.
    mfcc_layer = paddlespeech.audio.features.MFCC(
        n_fft=self.n_fft,
        f_min=self.fmin,
        top_db=self.top_db,
        dtype=batch.dtype,
        **common)
    from_layer = mfcc_layer(batch).squeeze(0).numpy()

    for candidate in (from_compliance, from_layer):
        np.testing.assert_array_almost_equal(expected, candidate, decimal=4)
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

@ -15,8 +15,8 @@ import unittest
import numpy as np
import paddle
import paddleaudio
import paddlespeech.audio
from .base import FeatTest
from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
@ -33,7 +33,7 @@ class TestLogMelSpectrogram(FeatTest):
ps_res = ps_melspect(self.waveform.T).squeeze(1).T
x = paddle.to_tensor(self.waveform)
ps_melspect = paddlespeech.audio.features.LogMelSpectrogram(
ps_melspect = paddleaudio.features.LogMelSpectrogram(
self.sr,
self.n_fft,
self.hop_length,

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save