parent
d94996f222
commit
13ee17cdcb
@ -1,3 +0,0 @@
|
||||
|
||||
add_subdirectory(third_party)
|
||||
add_subdirectory(src)
|
@ -1,31 +0,0 @@
|
||||
# PaddleAudio
|
||||
|
||||
## Reference
|
||||
The `csrc` code is adapted from `torchaudio`.
|
||||
|
||||
```text
|
||||
BSD 2-Clause License
|
||||
|
||||
Copyright (c) [year], [fullname]
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
```
|
@ -1,164 +0,0 @@
|
||||
import os
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
from ._internal import module_utils as _mod_utils # noqa: F401
|
||||
|
||||
|
||||
import contextlib
|
||||
import ctypes
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
|
||||
# Query `hasattr` only once. The dlopen-flag accessors are not present on
# every platform, so their availability is probed a single time at import.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
                                                               'setdlopenflags')


@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the interpreter's dlopen flags.

    Context manager used while opening a shared library that provides custom
    operators. See
    https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html

    When the platform does not expose ``sys.getdlopenflags`` /
    ``sys.setdlopenflags`` the manager does nothing.

    The previous flags are restored even when the guarded block raises, so a
    failed library load does not leave ``RTLD_GLOBAL`` set for later imports.
    """
    if not _SET_GLOBAL_FLAGS:
        yield
        return

    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        # Always undo the change, including on exceptions from the body.
        sys.setdlopenflags(old_flags)
|
||||
|
||||
|
||||
def resolve_library_path(path: str) -> str:
    """Return the canonical form of ``path`` as computed by ``os.path.realpath``."""
    canonical = os.path.realpath(path)
    return canonical
|
||||
|
||||
|
||||
class _Ops(types.ModuleType):
|
||||
#__file__ = '_ops.py'
|
||||
|
||||
def __init__(self):
|
||||
super(_Ops, self).__init__('paddlespeech.ops')
|
||||
self.loaded_libraries = set()
|
||||
|
||||
def load_library(self, path):
|
||||
"""
|
||||
Loads a shared library from the given path into the current process.
|
||||
This allows dynamically loading custom operators. For this,
|
||||
you should compile your operator and
|
||||
the static registration code into a shared library object, and then
|
||||
call ``paddlespeech.ops.load_library('path/to/libcustom.so')`` to load the
|
||||
shared object.
|
||||
After the library is loaded, it is added to the
|
||||
``paddlespeech.ops.loaded_libraries`` attribute, a set that may be inspected
|
||||
for the paths of all libraries loaded using this function.
|
||||
Args:
|
||||
path (str): A path to a shared library to load.
|
||||
"""
|
||||
path = resolve_library_path(path)
|
||||
with dl_open_guard():
|
||||
# https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
|
||||
# Import the shared library into the process, thus running its
|
||||
# static (global) initialization code in order to register custom
|
||||
# operators with the JIT.
|
||||
ctypes.CDLL(path)
|
||||
self.loaded_libraries.add(path)
|
||||
|
||||
|
||||
_LIB_DIR = Path(__file__).parent / "lib"
|
||||
|
||||
def _get_lib_path(lib: str):
|
||||
suffix = "pyd" if os.name == "nt" else "so"
|
||||
path = _LIB_DIR / f"{lib}.{suffix}"
|
||||
return path
|
||||
|
||||
|
||||
def _load_lib(lib: str) -> bool:
    """Load the bundled shared library ``lib`` if its file is present.

    Note:
        A missing library file is deliberately not an error. When the package
        is deployed in ``pex`` format the library is not in the standard
        location; it is then expected to be reachable through the dynamic
        loader's search path, so that importing the extension module makes the
        loader find it.

    Returns:
        bool:
            True if the library file exists and was loaded without failure.
            False if the library file does not exist (a warning is emitted).

    Raises:
        Exception:
            If the file exists but loading fails, the exception from the
            underlying ``ctypes.CDLL`` call propagates unchanged. The expected
            case is ``OSError`` for a missing dynamic dependency, which is not
            recoverable: the user has to install the dependency.
    """
    lib_file = _get_lib_path(lib)
    if lib_file.exists():
        ops.load_library(lib_file)
        return True

    warnings.warn("lib path is not exists:" + str(lib_file))
    return False
|
||||
|
||||
|
||||
# Set to True once `_init_ffmpeg` has completed successfully.
_FFMPEG_INITIALIZED = False


def _init_ffmpeg():
    """Initialize the FFmpeg integration; later calls are no-ops.

    Raises:
        RuntimeError: If the extension reports that it was built without
            FFmpeg support.
        ImportError: If loading the FFmpeg shared library raises ``OSError``,
            or if the FFmpeg binding module cannot be imported.
    """
    global _FFMPEG_INITIALIZED
    if _FFMPEG_INITIALIZED:
        return

    # Bind the compiled extension explicitly. This module never imports
    # `paddlespeech`, so the previous dotted references raised NameError.
    # NOTE(review): module and library names now follow the `_paddleaudio` /
    # `libpaddleaudio` spelling used by `_init_extension`; confirm they match
    # the artifacts produced by the build.
    from paddlespeech.audio import _paddleaudio

    if not _paddleaudio.is_ffmpeg_available():
        raise RuntimeError(
            "paddleaudio is not compiled with FFmpeg integration. Please set USE_FFMPEG=1 when compiling paddleaudio."
        )

    try:
        _load_lib("libpaddleaudio_ffmpeg")
    except OSError as err:
        raise ImportError(
            "FFmpeg libraries are not found. Please install FFmpeg.") from err

    import paddlespeech.audio._paddleaudio_ffmpeg  # noqa

    _paddleaudio.ffmpeg_init()
    # Cap the FFmpeg log level at 8 when it is currently higher.
    if _paddleaudio.ffmpeg_get_log_level() > 8:
        _paddleaudio.ffmpeg_set_log_level(8)

    _FFMPEG_INITIALIZED = True
|
||||
|
||||
|
||||
def _init_extension():
    """Load the C++ extension and, on a best-effort basis, FFmpeg support.

    Emits a warning and returns early when the
    ``paddlespeech.audio._paddleaudio`` module cannot be located.
    """
    extension_found = _mod_utils.is_module_available(
        "paddlespeech.audio._paddleaudio")
    if not extension_found:
        warnings.warn("paddlespeech C++ extension is not available.")
        return

    _load_lib("libpaddleaudio")
    # This import initializes the methods registered via PyBind11.
    # It has to happen after the base library is loaded.
    from paddlespeech.audio import _paddleaudio  # noqa

    # This code runs as part of importing the package, so a failure to
    # initialize FFmpeg is ignored here. If the FFmpeg integration is not
    # properly initialized, a detailed error is raised when client code
    # attempts to import the dedicated feature.
    with contextlib.suppress(Exception):
        _init_ffmpeg()


# The ops "namespace" used by `_load_lib`.
ops = _Ops()

# Import-time side effect: try to load the compiled extension.
_init_extension()
|
@ -1,148 +0,0 @@
|
||||
import importlib.util
|
||||
import warnings
|
||||
from functools import wraps
|
||||
from typing import Optional
|
||||
|
||||
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
|
||||
|
||||
|
||||
def is_module_available(*modules: str) -> bool:
    r"""Return whether every named module can be located, without importing it.

    This is generally safer than a try-except block around ``import X``. It
    avoids third-party libraries breaking assumptions of some of our tests,
    e.g. setting the multiprocessing start method when imported
    (see librosa/#747, torchvision/#544).

    Note:
        For a dotted name (``pkg.sub``) ``importlib.util.find_spec`` imports
        the parent package in order to search it. If the parent is missing or
        fails to import, the module is reported as unavailable instead of the
        ``ImportError`` propagating to the caller.

    Args:
        *modules (str): Module names to look up.

    Returns:
        bool: True if a spec was found for every name (also when no names are
        given), False otherwise.
    """
    for name in modules:
        try:
            if importlib.util.find_spec(name) is None:
                return False
        except ImportError:
            # Includes ModuleNotFoundError for a missing parent package.
            return False
    return True
|
||||
|
||||
|
||||
def requires_module(*modules: str):
    """Build a decorator that fails at call time when optional modules are missing.

    Availability is checked once, when this factory is called. If every module
    is available the decorator returns the function untouched. Otherwise the
    decorated function is replaced by a stub that raises ``RuntimeError``
    naming the missing module(s), which is a clearer message than a
    ``NameError: name 'module' is not defined`` raised at a random place.
    """
    unavailable = [name for name in modules if not is_module_available(name)]

    if unavailable:
        if len(unavailable) == 1:
            req = f"module: {unavailable[0]}"
        else:
            req = f"modules: {unavailable}"

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires {req}")

            return wrapped
    else:
        # Nothing is missing, so there is no need to wrap the function.
        def decorator(func):
            return func

    return decorator
|
||||
|
||||
|
||||
def deprecated(direction: str, version: Optional[str]=None):
    """Build a decorator that emits a deprecation warning on every call.

    Args:
        direction (str): Migration steps to be given to users.
        version (str or int): The version when the object will be removed.
            When ``None`` the message says "future".
    """
    release = "future" if version is None else version

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            # stacklevel=2 attributes the warning to the caller of `func`.
            warnings.warn(
                f"{func.__module__}.{func.__name__} has been deprecated "
                f"and will be removed from {release} release. "
                f"{direction}",
                stacklevel=2)
            return func(*args, **kwargs)

        return wrapped

    return decorator
|
||||
|
||||
|
||||
def is_kaldi_available():
    """Report whether the ``paddlespeech.audio._paddleaudio`` module can be located."""
    extension = "paddlespeech.audio._paddleaudio"
    return is_module_available(extension)
|
||||
|
||||
|
||||
def requires_kaldi():
    """Build a decorator that guards functions depending on kaldi support.

    Availability is checked once, when this factory is called. If
    ``is_kaldi_available()`` is true the decorator returns the function
    unchanged; otherwise the function is replaced by a stub that raises
    ``RuntimeError`` when called.
    """
    if not is_kaldi_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires kaldi")

            return wrapped

        return decorator

    def decorator(func):
        return func

    return decorator
|
||||
|
||||
|
||||
def _check_soundfile_importable():
    """Return True only if ``soundfile`` is installed and imports cleanly.

    When the module is found but importing it raises, a warning is emitted
    and False is returned.
    """
    if is_module_available("soundfile"):
        try:
            import soundfile  # noqa: F401
        except Exception:
            warnings.warn(
                "Failed to import soundfile. 'soundfile' backend is not available.")
        else:
            return True
    return False
|
||||
|
||||
|
||||
# Probed once at import time; `is_soundfile_available` reports this value.
_is_soundfile_importable = _check_soundfile_importable()


def is_soundfile_available():
    """Return the import-time result of the ``soundfile`` importability probe."""
    return _is_soundfile_importable
|
||||
|
||||
|
||||
def requires_soundfile():
    """Build a decorator that guards functions depending on ``soundfile``.

    Availability is checked once, when this factory is called. If
    ``is_soundfile_available()`` is true the decorator returns the function
    unchanged; otherwise the function is replaced by a stub that raises
    ``RuntimeError`` when called.
    """
    if not is_soundfile_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires soundfile")

            return wrapped

        return decorator

    def decorator(func):
        return func

    return decorator
|
||||
|
||||
|
||||
def is_sox_available():
    """Report whether the ``paddlespeech.audio._paddleaudio`` module can be located."""
    extension = "paddlespeech.audio._paddleaudio"
    return is_module_available(extension)
|
||||
|
||||
|
||||
def requires_sox():
    """Build a decorator that guards functions depending on sox support.

    Availability is checked once, when this factory is called. If
    ``is_sox_available()`` is true the decorator returns the function
    unchanged; otherwise the function is replaced by a stub that raises
    ``RuntimeError`` when called.
    """
    if not is_sox_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires sox")

            return wrapped

        return decorator

    def decorator(func):
        return func

    return decorator
|
@ -1,63 +0,0 @@
|
||||
import contextlib
|
||||
import ctypes
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
|
||||
# Query `hasattr` only once. The dlopen-flag accessors are not present on
# every platform, so their availability is probed a single time at import.
_SET_GLOBAL_FLAGS = hasattr(sys, 'getdlopenflags') and hasattr(sys,
                                                               'setdlopenflags')


@contextlib.contextmanager
def dl_open_guard():
    """Temporarily add ``RTLD_GLOBAL`` to the interpreter's dlopen flags.

    Context manager used while opening a shared library that provides custom
    operators. See
    https://manpages.debian.org/bullseye/manpages-dev/dlopen.3.en.html

    When the platform does not expose ``sys.getdlopenflags`` /
    ``sys.setdlopenflags`` the manager does nothing.

    The previous flags are restored even when the guarded block raises, so a
    failed library load does not leave ``RTLD_GLOBAL`` set for later imports.
    """
    if not _SET_GLOBAL_FLAGS:
        yield
        return

    old_flags = sys.getdlopenflags()
    sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL)
    try:
        yield
    finally:
        # Always undo the change, including on exceptions from the body.
        sys.setdlopenflags(old_flags)
|
||||
|
||||
|
||||
def resolve_library_path(path: str) -> str:
    """Return the canonical form of ``path`` as computed by ``os.path.realpath``."""
    canonical = os.path.realpath(path)
    return canonical
|
||||
|
||||
|
||||
class _Ops(types.ModuleType):
    """Module-like namespace that tracks the shared libraries it has loaded."""

    __file__ = '_ops.py'

    def __init__(self):
        super().__init__('paddlespeech.ops')
        # Resolved paths of the libraries opened through `load_library`.
        self.loaded_libraries = set()

    def load_library(self, path):
        """Open a shared library in the current process and remember its path.

        This is the hook for dynamically loading custom operators: compile the
        operator together with its static registration code into a shared
        object, then call
        ``paddlespeech.ops.load_library('path/to/libcustom.so')``.

        Once the library has been opened, its resolved path is added to
        ``paddlespeech.ops.loaded_libraries``, a set that can be inspected for
        everything loaded through this function.

        Args:
            path (str): A path to a shared library to load.
        """
        resolved = resolve_library_path(path)
        with dl_open_guard():
            # https://docs.python.org/3/library/ctypes.html?highlight=ctypes#loading-shared-libraries
            # Opening the library runs its static (global) initialization
            # code, which is where custom operators register themselves.
            ctypes.CDLL(resolved)
        self.loaded_libraries.add(resolved)


# The ops "namespace"
ops = _Ops()
|
@ -1,18 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# flake8: noqa
|
||||
from . import utils
|
||||
from .utils import get_audio_backend
|
||||
from .utils import list_audio_backends
|
||||
from .utils import set_audio_backend
|
@ -1,55 +0,0 @@
|
||||
# code from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py
|
||||
|
||||
class AudioMetaData:
    """Metadata describing an audio file, as returned by an ``info`` function.

    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.

    :ivar int sample_rate: Sample rate
    :ivar int num_frames: The number of frames
    :ivar int num_channels: The number of channels
    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
        or when it cannot be accurately inferred.
    :ivar str encoding: Audio encoding
        The values encoding can take are one of the following:

            * ``PCM_S``: Signed integer linear PCM
            * ``PCM_U``: Unsigned integer linear PCM
            * ``PCM_F``: Floating point linear PCM
            * ``FLAC``: Flac, Free Lossless Audio Codec
            * ``ULAW``: Mu-law
            * ``ALAW``: A-law
            * ``MP3`` : MP3, MPEG-1 Audio Layer III
            * ``VORBIS``: OGG Vorbis
            * ``AMR_WB``: Adaptive Multi-Rate Wideband
            * ``AMR_NB``: Adaptive Multi-Rate (narrowband)
            * ``OPUS``: Opus
            * ``HTK``: Single channel 16-bit PCM
            * ``UNKNOWN`` : None of above
    """

    def __init__(
            self,
            sample_rate: int,
            num_frames: int,
            num_channels: int,
            bits_per_sample: int,
            encoding: str, ):
        self.sample_rate = sample_rate
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.bits_per_sample = bits_per_sample
        self.encoding = encoding

    def __str__(self):
        """Render all fields as ``AudioMetaData(name=value, ...)``."""
        fields = ", ".join((
            f"sample_rate={self.sample_rate}",
            f"num_frames={self.num_frames}",
            f"num_channels={self.num_channels}",
            f"bits_per_sample={self.bits_per_sample}",
            f"encoding={self.encoding}", ))
        return f"AudioMetaData({fields})"
|
@ -1,32 +0,0 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
from typing import Optional
|
||||
from typing import Tuple
|
||||
from typing import Union
|
||||
|
||||
from paddle import Tensor
|
||||
|
||||
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
|
||||
|
||||
|
||||
def load(
        filepath: Union[str, Path],
        out: Optional[Tensor]=None,
        normalization: Union[bool, float, Callable]=True,
        channels_first: bool=True,
        num_frames: int=0,
        offset: int=0,
        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Placeholder ``load`` for the case where no audio I/O backend exists.

    Every argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
|
||||
|
||||
|
||||
def save(filepath: str,
         src: Tensor,
         sample_rate: int,
         precision: int=16,
         channels_first: bool=True) -> None:
    """Placeholder ``save`` for the case where no audio I/O backend exists.

    Every argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
|
||||
|
||||
|
||||
def info(filepath: str) -> None:
    """Placeholder ``info`` for the case where no audio I/O backend exists.

    The argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
|
@ -1,662 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import warnings
|
||||
from typing import Optional
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import resampy
|
||||
import soundfile
|
||||
from scipy.io import wavfile
|
||||
|
||||
from ..utils import depth_convert
|
||||
from ..utils import ParameterError
|
||||
from .common import AudioMetaData
|
||||
|
||||
# Public names of this module. Each name is listed exactly once.
__all__ = [
    'resample',
    'to_mono',
    'normalize',
    'save',
    'soundfile_save',
    'load',
    'soundfile_load',
    'info',
]
# Normalization types handled by `normalize`.
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Channel-merge strategies accepted by `to_mono`.
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Filter names accepted by `resample`.
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small constant that keeps the divisions in `normalize` away from zero.
EPS = 1e-8
|
||||
|
||||
|
||||
def resample(y: np.ndarray,
             src_sr: int,
             target_sr: int,
             mode: str='kaiser_fast') -> np.ndarray:
    """Audio resampling.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        src_sr (int): Source sample rate.
        target_sr (int): Target sample rate.
        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        np.ndarray: `y` resampled to `target_sr`

    Raises:
        ParameterError: If `y` is not a ``np.ndarray`` or `mode` is not one of
            ``RESAMPLE_MODES``.
    """

    if mode == 'kaiser_best':
        # Adjacent literals are used instead of a backslash continuation so
        # that source indentation does not leak into the message.
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
            'This function is pretty slow, we recommend the mode '
            'kaiser_fast in large scale audio training')

    if not isinstance(y, np.ndarray):
        # The message needs the f-prefix to interpolate the received type.
        raise ParameterError(
            f'Only support numpy np.ndarray, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
|
||||
|
||||
|
||||
def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    """Convert stereo audio to mono.

    Only the first two rows of a 2D input (``y[0]`` and ``y[1]``) are ever
    used; a 1D input is returned unchanged.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.

    Returns:
        np.ndarray: `y` with mono channel.

    Raises:
        ParameterError: If `merge_type` is not in ``MERGE_TYPES``, if `y` has
            more than two dimensions, or if averaging is requested for a dtype
            that is neither floating point nor ``int16``/``int8``.
    """

    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # merge_type == 'average': the arithmetic depends on the dtype.
    if np.issubdtype(y.dtype, np.floating):
        # Any floating dtype (float32, float64, ...) is averaged directly.
        return (y[0] + y[1]) * 0.5

    # Integers are widened first so the sum of two samples cannot overflow.
    wider = {'int16': 'int32', 'int8': 'int16'}.get(y.dtype.name)
    if wider is None:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')
    widened = y.astype(wider)
    # The floor-average of two in-range values is itself in range, so the
    # result can be cast back without clipping.
    return ((widened[0] + widened[1]) // 2).astype(y.dtype)
|
||||
|
||||
|
||||
def soundfile_load_(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
    """Read a waveform with the soundfile library.

    `offset` and `duration` are multiplied by the file's sample rate to obtain
    the start frame and the number of frames to read.

    Args:
        file (os.PathLike): File of waveform.
        offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.

    Returns:
        Tuple[np.ndarray, int]: Transposed sample array and the file's samplerate.
    """
    with soundfile.SoundFile(file) as stream:
        native_rate = stream.samplerate
        if offset:
            stream.seek(int(offset * native_rate))
        # -1 asks soundfile to read through to the end of the file.
        frame_count = -1 if duration is None else int(duration * native_rate)
        samples = stream.read(
            frames=frame_count, dtype=dtype, always_2d=False).T

    return samples, stream.samplerate
|
||||
|
||||
|
||||
def normalize(y: np.ndarray, norm_type: str='linear',
              mul_factor: float=1.0) -> np.ndarray:
    """Scale a waveform by peak or by mean/standard deviation.

    ``'linear'`` divides by the maximum absolute value (plus ``EPS``);
    ``'gaussian'`` subtracts the mean and divides by the standard deviation
    (at least ``EPS``). The result is multiplied by `mul_factor` in both cases.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        mul_factor (float, optional): Scaling factor. Defaults to 1.0.

    Returns:
        np.ndarray: `y` after normalization.

    Raises:
        NotImplementedError: If `norm_type` is neither 'linear' nor 'gaussian'.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor

    if norm_type == 'gaussian':
        mean = np.mean(y)
        std = max(np.std(y), EPS)
        return mul_factor * (y - mean) / std

    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
|
||||
|
||||
|
||||
def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
    """Save audio file to disk.

    The waveform is written with ``scipy.io.wavfile``. Data that is not
    already ``int16`` or ``int8`` is converted to ``int16`` first, and a
    warning is emitted.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        sr (int): Sample rate.
        file (os.PathLike): Path of audio file to save. Both ``str`` and
            path-like objects such as ``pathlib.Path`` are accepted.

    Raises:
        ParameterError: If the file name does not end with ``.wav`` or `sr`
            is not positive.
    """
    # Normalize path-like objects to a plain path so the suffix check also
    # works for pathlib.Path, which has no `endswith` method.
    filename = os.fspath(file)
    if not filename.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    wavfile.write(filename, sr, y_out)
|
||||
|
||||
def soundfile_load(
        file: os.PathLike,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
    """Load an audio file and optionally down-mix, resample and normalize it.

    The steps run in this order: read, empty check, mono merge, resampling,
    normalization, depth conversion to `dtype`.

    Args:
        file (os.PathLike): Path of audio file to load.
        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
        mono (bool, optional): Return waveform with mono channel. Defaults to True.
        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
        normal (bool, optional): Waveform normalization. Defaults to True.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.

    Raises:
        ParameterError: If the loaded waveform contains no samples.
    """
    waveform, rate = soundfile_load_(
        file, offset=offset, dtype=dtype, duration=duration)

    has_samples = (waveform.ndim == 1 and len(waveform) > 0) or (
        waveform.ndim == 2 and len(waveform[0]) > 0)
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        waveform = to_mono(waveform, merge_type)

    if sr is not None and sr != rate:
        waveform = resample(waveform, rate, sr, mode=resample_mode)
        rate = sr

    if normal:
        waveform = normalize(waveform, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # Integer targets are still normalized before the depth conversion.
        waveform = normalize(waveform, 'linear', 1.0)

    return depth_convert(waveform, dtype), rate
|
||||
|
||||
# The code below is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py
|
||||
|
||||
def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int):
    """Pick the soundfile subtype string for a WAV file.

    With no `encoding`, the subtype comes from `bits_per_sample` when given,
    otherwise from `dtype`. With an `encoding`, only the bit depths accepted
    by the branches below are allowed.

    Raises:
        ValueError: If the dtype, encoding or bit depth is not supported.
    """
    if not encoding:
        if bits_per_sample:
            return "PCM_U8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
        by_dtype = {
            paddle.uint8: "PCM_U8",
            paddle.int16: "PCM_16",
            paddle.int32: "PCM_32",
            paddle.float32: "FLOAT",
            paddle.float64: "DOUBLE",
        }
        subtype = by_dtype.get(dtype)
        if not subtype:
            raise ValueError(f"Unsupported dtype for wav: {dtype}")
        return subtype

    if encoding == "PCM_S":
        if not bits_per_sample:
            return "PCM_32"
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}"
    elif encoding == "PCM_U":
        if bits_per_sample in (None, 8):
            return "PCM_U8"
        raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
    elif encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample == 64:
            return "DOUBLE"
        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
    elif encoding == "ULAW":
        if bits_per_sample in (None, 8):
            return "ULAW"
        raise ValueError("wav only supports 8-bit mu-law encoding.")
    elif encoding == "ALAW":
        if bits_per_sample in (None, 8):
            return "ALAW"
        raise ValueError("wav only supports 8-bit a-law encoding.")

    raise ValueError(f"wav does not support {encoding}.")
|
||||
|
||||
|
||||
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
|
||||
if encoding in (None, "PCM_S"):
|
||||
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
|
||||
if encoding in ("PCM_U", "PCM_F"):
|
||||
raise ValueError(f"sph does not support {encoding} encoding.")
|
||||
if encoding == "ULAW":
|
||||
if bits_per_sample in (None, 8):
|
||||
return "ULAW"
|
||||
raise ValueError("sph only supports 8-bit for mu-law encoding.")
|
||||
if encoding == "ALAW":
|
||||
return "ALAW"
|
||||
raise ValueError(f"sph does not support {encoding}.")
|
||||
|
||||
|
||||
def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int):
    """Pick the soundfile subtype string for the given container `format`.

    ``wav`` and ``sph`` are delegated to their dedicated helpers; ``flac``,
    ``ogg``/``vorbis`` and ``nis``/``nist`` are resolved here.

    Raises:
        ValueError: If the format is unknown or the encoding / bit depth is
            not supported by the format.
    """
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
    elif format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample > 24:
            raise ValueError("flac does not support bits_per_sample > 24.")
        if bits_per_sample == 8:
            return "PCM_S8"
        return f"PCM_{bits_per_sample}"
    elif format in ("ogg", "vorbis"):
        if encoding or bits_per_sample:
            raise ValueError(
                "ogg/vorbis does not support encoding/bits_per_sample.")
        return "VORBIS"
    elif format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)
    elif format in ("nis", "nist"):
        return "PCM_16"

    raise ValueError(f"Unsupported format: {format}")
|
||||
|
||||
def save(
    filepath: str,
    src: paddle.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (str or pathlib.Path): Path to audio file, or a file-like object
            (anything with a ``write`` attribute).
        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
        sample_rate (int): Sampling rate.
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reasons with the "sox_io" backend.
            A warning is emitted when a value is given.
        format (str or None, optional): Override the audio format.
            When ``filepath`` is a file-like object this argument is required and also
            selects the encoding/bit-depth rules.

            When ``filepath`` is a path-like object the encoding/bit-depth rules are
            selected from the file extension, and ``format`` is forwarded to soundfile.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is not one of the valid
            values, or the format/encoding/bit depth combination is unsupported.
        RuntimeError: If ``filepath`` is a file-like object and ``format`` is ``None``.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )
    if hasattr(filepath, "write"):
        # A file object carries no extension, so the caller must name the format.
        if format is None:
            raise RuntimeError("`format` is required when saving to file object.")
        ext = format.lower()
    else:
        ext = str(filepath).split(".")[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is an extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so the aliases are translated manually here. This must also happen when the caller
    # spelled the format with one of the aliases (always the case for file objects, where
    # ``format`` is mandatory); otherwise soundfile would receive an unknown format name.
    nist_aliases = ("nis", "nist", "sph")
    if ext in nist_aliases and (format is None or format.lower() in nist_aliases):
        format = "NIST"

    if channels_first:
        # soundfile expects data laid out as (frames, channels).
        src = src.t()

    soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
|
||||
|
||||
# soundfile subtype name -> dtype name used when the data is read without
# converting it to float32. Subtypes missing from this table cannot be read that way.
_SUBTYPE2DTYPE = dict(
    PCM_S8="int8",
    PCM_U8="uint8",
    PCM_16="int16",
    PCM_32="int32",
    FLOAT="float32",
    DOUBLE="float64",
)
|
||||
|
||||
def load(
    filepath: str,
    frame_offset: int = 0,
    num_frames: int = -1,
    normalize: bool = True,
    channels_first: bool = True,
    format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
    """Load audio data from file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    With the defaults (``normalize=True``, ``channels_first=True``) the result is a
    ``float32`` Tensor of shape `[channel, time]` whose samples are normalized to
    ``[-1.0, 1.0]``.

    For WAV input with an integer type (32-bit signed, 16-bit signed or 8-bit unsigned;
    24-bit signed integer is not supported), passing ``normalize=False`` returns an integer
    Tensor using the full range of the matching dtype: ``int32`` for 32-bit signed PCM,
    ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.

    ``normalize`` has no effect on 32-bit floating-point WAV or on non-WAV formats such as
    ``flac`` and ``mp3``; those are always returned as ``float32`` with values in
    ``[-1.0, 1.0]``.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before reading starts.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads everything that remains
            after ``frame_offset``. Fewer frames may be returned when the file does
            not contain enough of them.
        normalize (bool, optional):
            When ``True`` the result is always ``float32`` with samples normalized to
            ``[-1.0, 1.0]``. For integer WAV input, ``False`` keeps the integer type.
            It has no effect for anything other than integer WAV.
        channels_first (bool, optional):
            When True the returned Tensor is `[channel, time]`,
            otherwise `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (paddle.Tensor, int): Resulting Tensor and sample rate.
            The Tensor has an integer type when the input is integer WAV and
            normalization is off, else ``float32``. Its layout is `[channel, time]`
            when ``channels_first=True`` and `[time, channel]` otherwise.

    Raises:
        ValueError: If un-normalized WAV data is requested for a subtype that has no
            entry in ``_SUBTYPE2DTYPE``.
    """
    with soundfile.SoundFile(filepath, "r") as audio_file:
        # Only WAV read with normalize=False keeps its native sample type.
        keep_native = audio_file.format == "WAV" and not normalize
        if not keep_native:
            read_dtype = "float32"
        elif audio_file.subtype in _SUBTYPE2DTYPE:
            read_dtype = _SUBTYPE2DTYPE[audio_file.subtype]
        else:
            raise ValueError(f"Unsupported subtype: {audio_file.subtype}")

        # Private soundfile helper: seeks to frame_offset and returns the frame count to read.
        frame_count = audio_file._prepare_read(frame_offset, None, num_frames)
        samples = audio_file.read(frame_count, read_dtype, always_2d=True)
        sample_rate = audio_file.samplerate

    waveform = paddle.to_tensor(samples)
    if not channels_first:
        return waveform, sample_rate
    # soundfile returns (time, channel); swap the axes for channel-first output.
    return paddle.transpose(waveform, perm=[1, 0]), sample_rate
|
||||
|
||||
|
||||
# Number of bits per sample for each soundfile subtype.
# The table is largely heuristic: 0 means the bit depth is meaningless
# (lossy codecs) or could not be determined.
# ADPCM and G72X subtypes get 0 because their bit depth is not fixed by the standard;
# per https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# 8 bits appears to be the default but it may be compressed down to 4 bits.
# The list of subtypes follows
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    # Linear PCM
    "PCM_S8": 8,  # signed 8 bit
    "PCM_16": 16,  # signed 16 bit
    "PCM_24": 24,  # signed 24 bit
    "PCM_32": 32,  # signed 32 bit
    "PCM_U8": 8,  # unsigned 8 bit (WAV and RAW only)
    # Floating point
    "FLOAT": 32,  # 32 bit float
    "DOUBLE": 64,  # 64 bit float
    # G.711 companding, see https://en.wikipedia.org/wiki/G.711#Types
    "ULAW": 8,  # mu-law
    "ALAW": 8,  # a-law
    # ADPCM family and other codecs without a well-defined bit depth
    "IMA_ADPCM": 0,  # IMA ADPCM
    "MS_ADPCM": 0,  # Microsoft ADPCM
    "GSM610": 0,  # GSM 6.10 (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM
    "G723_24": 0,  # 24kbs G723 ADPCM
    "G723_40": 0,  # 40kbs G723 ADPCM
    # Delta Width Variable Word
    "DWVW_12": 12,  # 12 bit
    "DWVW_16": 16,  # 16 bit
    "DWVW_24": 24,  # 24 bit
    "DWVW_N": 0,  # N bit
    # Differential PCM (XI only)
    "DPCM_8": 8,  # 8 bit
    "DPCM_16": 16,  # 16 bit
    # Xiph Vorbis (lossy)
    "VORBIS": 0,
    # Apple Lossless Audio Codec
    "ALAC_16": 16,  # 16 bit
    "ALAC_20": 20,  # 20 bit
    "ALAC_24": 24,  # 24 bit
    "ALAC_32": 32,  # 32 bit
}
|
||||
|
||||
def _get_bit_depth(subtype):
    """Return the bits per sample recorded for a soundfile subtype.

    Args:
        subtype: soundfile subtype name.

    Returns:
        int: The value from ``_SUBTYPE_TO_BITS_PER_SAMPLE``, or ``0`` (after emitting
        a warning) when the subtype is not in that table.
    """
    try:
        return _SUBTYPE_TO_BITS_PER_SAMPLE[subtype]
    except KeyError:
        warnings.warn(
            f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
            "attribute will be set to 0. If you are seeing this warning, please "
            "report by opening an issue on github (after checking for existing/closed ones). "
            "You may otherwise ignore this warning."
        )
        return 0
|
||||
|
||||
# soundfile subtype name -> encoding label reported in the audio metadata.
_SUBTYPE_TO_ENCODING = {
    **dict.fromkeys(("PCM_S8", "PCM_16", "PCM_24", "PCM_32"), "PCM_S"),
    "PCM_U8": "PCM_U",
    **dict.fromkeys(("FLOAT", "DOUBLE"), "PCM_F"),
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}
|
||||
|
||||
def _get_encoding(format: str, subtype: str):
    """Return the encoding label for a soundfile format/subtype pair.

    FLAC files are always labelled ``"FLAC"``; every other format is looked up by
    subtype in ``_SUBTYPE_TO_ENCODING``, with ``"UNKNOWN"`` for subtypes not listed.
    """
    if format != "FLAC":
        return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
    return "FLAC"
|
||||
|
||||
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
    """Get signal information of an audio file.

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioMetaData: meta data of the given audio, built from the sample rate, frame
        count and channel count reported by soundfile plus the derived bit depth and
        encoding label.
    """
    file_info = soundfile.info(filepath)
    bit_depth = _get_bit_depth(file_info.subtype)
    encoding_label = _get_encoding(file_info.format, file_info.subtype)
    return AudioMetaData(
        file_info.samplerate,
        file_info.frames,
        file_info.channels,
        bits_per_sample=bit_depth,
        encoding=encoding_label,
    )
|
@ -1,101 +0,0 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import paddle
|
||||
from paddle import Tensor
|
||||
from .common import AudioMetaData
|
||||
import os
|
||||
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
from paddlespeech.audio import _paddleaudio as paddleaudio
|
||||
|
||||
#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
|
||||
|
||||
def _fail_info(filepath: str, format: Optional[str]) -> AudioMetaData:
    """Always raise ``RuntimeError`` naming the path whose metadata could not be read."""
    message = "Failed to fetch metadata from {}".format(filepath)
    raise RuntimeError(message)
|
||||
|
||||
|
||||
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioMetaData:
    """Always raise ``RuntimeError`` naming the file object whose metadata could not be read."""
    message = "Failed to fetch metadata from {}".format(fileobj)
    raise RuntimeError(message)
|
||||
|
||||
|
||||
# Note: need to comply TorchScript syntax -- need annotation and no f-string
|
||||
def _fail_load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Always raise ``RuntimeError`` naming the path that could not be loaded.

    The parameters mirror ``load`` so this can stand in for it; only ``filepath``
    is used, for the error message.
    """
    message = "Failed to load audio from {}".format(filepath)
    raise RuntimeError(message)
|
||||
|
||||
|
||||
def _fail_load_fileobj(fileobj, *args, **kwargs):
    """Always raise ``RuntimeError`` naming the file object that could not be loaded.

    Extra positional and keyword arguments are accepted and ignored.
    """
    message = "Failed to load audio from {}".format(fileobj)
    raise RuntimeError(message)
|
||||
|
||||
# Handlers that ``load``/``info`` call when the native extension returns ``None``.
# By default each of them just raises ``RuntimeError``.
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
# ``load`` looks this handler up as ``_fallback_load_fileobj``; it used to be bound
# only under the misspelled name below, which made the file-object failure path
# raise ``NameError`` instead of the intended ``RuntimeError``.
_fallback_load_fileobj = _fail_load_fileobj
# Misspelled historical name, kept as an alias for code that may still refer to it.
_fallback_load_filebj = _fallback_load_fileobj
|
||||
|
||||
@_mod_utils.requires_sox()
def load(
        filepath: str,
        frame_offset: int=0,
        num_frames: int=-1,
        normalize: bool=True,
        channels_first: bool=True,
        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Load audio through the sox extension.

    Args:
        filepath: Path-like object, or a file-like object (anything with a ``read``
            attribute).
        frame_offset, num_frames, normalize, channels_first, format: Forwarded
            unchanged to the native loader.

    Returns:
        Tuple[Tensor, int]: The first element of the native result converted with
        ``paddle.to_tensor``, and the second element as returned by the extension.
        If the native loader returns ``None`` the matching fallback handler is
        called instead.
    """
    read_options = (frame_offset, num_frames, normalize, channels_first,
                    format)

    if hasattr(filepath, "read"):
        result = paddleaudio.load_audio_fileobj(filepath, *read_options)
        if result is None:
            return _fallback_load_fileobj(filepath, *read_options)
    else:
        filepath = os.fspath(filepath)
        result = paddleaudio.sox_io_load_audio_file(filepath, *read_options)
        if result is None:
            return _fallback_load(filepath, *read_options)

    return (paddle.to_tensor(result[0]), result[1])
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def save(
        filepath: str,
        src: Tensor,
        sample_rate: int,
        channels_first: bool=True,
        compression: Optional[float]=None,
        format: Optional[str]=None,
        encoding: Optional[str]=None,
        bits_per_sample: Optional[int]=None, ):
    """Save audio through the sox extension.

    Args:
        filepath: Path-like object, or a file-like object (anything with a ``write``
            attribute).
        src (Tensor): Audio data; converted with ``src.numpy()`` before being handed
            to the extension.
        sample_rate, channels_first, compression, format, encoding, bits_per_sample:
            Forwarded unchanged to the native writer.
    """
    waveform = src.numpy()
    write_options = (waveform, sample_rate, channels_first, compression,
                     format, encoding, bits_per_sample)

    if not hasattr(filepath, "write"):
        path = os.fspath(filepath)
        paddleaudio.sox_io_save_audio_file(path, *write_options)
    else:
        paddleaudio.save_audio_fileobj(filepath, *write_options)
|
||||
|
||||
@_mod_utils.requires_sox()
def info(
        filepath: str,
        format: Optional[str]=None, ) -> AudioMetaData:
    """Fetch audio metadata through the sox extension.

    Args:
        filepath: Path-like object, or a file-like object (anything with a ``read``
            attribute).
        format (str or None, optional): Forwarded unchanged to the extension.

    Returns:
        AudioMetaData: Built by unpacking the value returned by the extension. If the
        extension returns ``None`` the matching fallback handler is called instead.
    """
    if hasattr(filepath, "read"):
        native_info = paddleaudio.get_info_fileobj(filepath, format)
        if native_info is None:
            return _fallback_info_fileobj(filepath, format)
    else:
        filepath = os.fspath(filepath)
        native_info = paddleaudio.get_info_file(filepath, format)
        if native_info is None:
            return _fallback_info(filepath, format)

    return AudioMetaData(*native_info)
|
@ -1,93 +0,0 @@
|
||||
"""Defines utilities for switching audio backends"""
|
||||
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
|
||||
|
||||
import warnings
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
|
||||
import paddlespeech.audio
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
|
||||
from . import no_backend, soundfile_backend, sox_io_backend
|
||||
|
||||
__all__ = [
|
||||
"list_audio_backends",
|
||||
"get_audio_backend",
|
||||
"set_audio_backend",
|
||||
]
|
||||
|
||||
|
||||
def list_audio_backends() -> List[str]:
    """List available backends

    Returns:
        List[str]: The list of available backends; ``"soundfile"`` comes before
        ``"sox_io"`` when both are present.
    """
    # Each backend name is paired with the check that decides whether it is usable.
    probes = (
        ("soundfile", lambda: _mod_utils.is_module_available("soundfile")),
        ("sox_io", lambda: _mod_utils.is_sox_available()),
    )
    return [name for name, is_available in probes if is_available()]
|
||||
|
||||
|
||||
def set_audio_backend(backend: Optional[str]):
    """Set the backend for I/O operation

    Rebinds ``save``, ``load`` and ``info`` on ``paddlespeech.audio`` to the
    functions of the selected backend module.

    Args:
        backend (str or None): Name of the backend.
            One of ``"sox_io"`` or ``"soundfile"`` based on availability
            of the system. If ``None`` is provided the current backend is unassigned.

    Raises:
        RuntimeError: If ``backend`` is not ``None`` and not an available backend.
        NotImplementedError: If ``backend`` is available but has no module mapped to it.
    """
    if backend is not None:
        if backend not in list_audio_backends():
            raise RuntimeError(f'Backend "{backend}" is not one of '
                               f"available backends: {list_audio_backends()}.")

    backend_modules = {
        None: no_backend,
        "sox_io": sox_io_backend,
        "soundfile": soundfile_backend,
    }
    if backend not in backend_modules:
        raise NotImplementedError(f'Unexpected backend "{backend}"')
    selected = backend_modules[backend]

    for func_name in ("save", "load", "info"):
        setattr(paddlespeech.audio, func_name, getattr(selected, func_name))
|
||||
|
||||
|
||||
# def _init_audio_backend():
|
||||
# backends = list_audio_backends()
|
||||
# if "sox_io" in backends:
|
||||
# set_audio_backend("sox_io")
|
||||
# elif "soundfile" in backends:
|
||||
# set_audio_backend("soundfile")
|
||||
# else:
|
||||
# warnings.warn("No audio backend is available.")
|
||||
# set_audio_backend(None)
|
||||
|
||||
|
||||
def _init_audio_backend():
    """Select the default backend: ``soundfile`` first, then ``sox_io``.

    When neither is available a warning is emitted and the backend is unassigned.
    """
    available = list_audio_backends()
    for preferred in ("soundfile", "sox_io"):
        if preferred in available:
            set_audio_backend(preferred)
            return
    warnings.warn("No audio backend is available.")
    set_audio_backend(None)
|
||||
|
||||
|
||||
def get_audio_backend() -> Optional[str]:
    """Get the name of the current backend

    The backend is identified by comparing ``paddlespeech.audio.load`` with the
    ``load`` function of each backend module.

    Returns:
        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.

    Raises:
        ValueError: If the current ``load`` matches none of the known backends.
    """
    candidates = (
        (None, no_backend),
        ("sox_io", sox_io_backend),
        ("soundfile", soundfile_backend),
    )
    for backend_name, backend_module in candidates:
        if paddlespeech.audio.load == backend_module.load:
            return backend_name
    raise ValueError("Unknown backend.")
|
@ -1,15 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from . import kaldi
|
||||
from . import librosa
|
@ -1,638 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from torchaudio(https://github.com/pytorch/audio)
|
||||
import math
|
||||
from typing import Tuple
|
||||
|
||||
import paddle
|
||||
from paddle import Tensor
|
||||
|
||||
from ..functional import create_dct
|
||||
from ..functional.window import get_window
|
||||
|
||||
__all__ = [
|
||||
'spectrogram',
|
||||
'fbank',
|
||||
'mfcc',
|
||||
]
|
||||
|
||||
# Names accepted for the ``window_type`` argument of the feature functions.
HANNING = "hann"
HAMMING = "hamming"
POVEY = "povey"
RECTANGULAR = "rect"
BLACKMAN = "blackman"
|
||||
|
||||
|
||||
def _get_epsilon(dtype):
    """Return the constant ``1e-07`` as a tensor of the given dtype."""
    epsilon_value = 1e-07
    return paddle.to_tensor(epsilon_value, dtype=dtype)
|
||||
|
||||
|
||||
def _next_power_of_2(x: int) -> int:
    """Return the smallest power of two that is >= ``x`` (``1`` for ``x == 0``)."""
    if x == 0:
        return 1
    # (x - 1).bit_length() is the exponent of the next power of two at or above x.
    exponent = (x - 1).bit_length()
    return 1 << exponent
|
||||
|
||||
|
||||
def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    """Cut a 1-D waveform into overlapping frames.

    Args:
        waveform (Tensor): 1-D signal.
        window_size (int): Number of samples per frame.
        window_shift (int): Number of samples between the starts of adjacent frames.
        snip_edges (bool): If True only frames that fit entirely inside the waveform
            are produced. If False the waveform is extended with reversed copies of
            itself before framing.

    Returns:
        Tensor: One frame per row, or an empty ``(0, 0)`` tensor when ``snip_edges``
        is True and the waveform is shorter than one window.
    """
    assert waveform.dim() == 1
    total_samples = waveform.shape[0]

    if not snip_edges:
        flipped = paddle.flip(waveform, [0])
        frame_count = (total_samples + (window_shift // 2)) // window_shift
        left_pad = window_size // 2 - window_shift // 2
        if left_pad > 0:
            # Prepend the last `left_pad` samples of the reversed signal and
            # append the whole reversed signal.
            waveform = paddle.concat(
                (flipped[-left_pad:], waveform, flipped), axis=0)
        else:
            # Non-positive pad: start the signal at index -left_pad instead of
            # prepending anything.
            waveform = paddle.concat((waveform[-left_pad:], flipped), axis=0)
    elif total_samples < window_size:
        return paddle.empty((0, 0), dtype=waveform.dtype)
    else:
        frame_count = 1 + (total_samples - window_size) // window_shift

    frames = paddle.signal.frame(waveform, window_size, window_shift)
    # Keep the first `frame_count` frames and put one frame on each row.
    return frames[:, :frame_count].T
|
||||
|
||||
|
||||
def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    """Build the analysis window of the requested type.

    Args:
        window_type (str): One of ``HANNING``, ``HAMMING``, ``POVEY``,
            ``RECTANGULAR`` or ``BLACKMAN``.
        window_size (int): Window length in samples.
        blackman_coeff (float): Coefficient used only by the Blackman window.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor: The window values.

    Raises:
        Exception: If ``window_type`` is not one of the known names.
    """
    if window_type == RECTANGULAR:
        return paddle.ones([window_size], dtype=dtype)

    if window_type == BLACKMAN:
        step = 2 * math.pi / (window_size - 1)
        positions = paddle.arange(window_size, dtype=dtype)
        blackman = (blackman_coeff - 0.5 * paddle.cos(step * positions) +
                    (0.5 - blackman_coeff) * paddle.cos(2 * step * positions))
        return blackman.astype(dtype)

    if window_type == HANNING:
        return get_window('hann', window_size, fftbins=False, dtype=dtype)

    if window_type == HAMMING:
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)

    if window_type == POVEY:
        # Povey window: a Hann window raised to the power 0.85.
        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
        return hann.pow(0.85)

    raise Exception('Invalid window type ' + window_type)
|
||||
|
||||
|
||||
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    """Compute the log energy of every frame.

    Args:
        strided_input (Tensor): Frames, one per row.
        epsilon (Tensor): Lower bound applied to the energy before taking the log.
        energy_floor (float): If non-zero, the log energy is additionally floored
            at ``log(energy_floor)``.

    Returns:
        Tensor: Log of the sum of squares along axis 1.
    """
    frame_energy = strided_input.pow(2).sum(1)
    log_energy = paddle.maximum(frame_energy, epsilon).log()
    if energy_floor != 0.0:
        log_floor = paddle.to_tensor(
            math.log(energy_floor), dtype=strided_input.dtype)
        log_energy = paddle.maximum(log_energy, log_floor)
    return log_energy
|
||||
|
||||
|
||||
def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    """Select one channel and derive the framing sizes in samples.

    Args:
        waveform (Tensor): Signal indexed as ``[channel, sample]``.
        channel (int): Channel to use; negative values are treated as ``0``.
        sr (int): Sample rate.
        frame_shift (float): Frame shift in milliseconds.
        frame_length (float): Frame length in milliseconds.
        round_to_power_of_two (bool): If True the padded window size is the next
            power of two at or above the window size.
        preemphasis_coefficient (float): Only validated here; must lie in ``[0, 1]``.

    Returns:
        Tuple[Tensor, int, int, int]: The selected channel, ``window_shift``,
        ``window_size`` and ``padded_window_size``.

    Raises:
        AssertionError: If any argument is outside its valid range.
    """
    channel = max(channel, 0)
    num_channels = waveform.shape[0]
    assert channel < num_channels, (
        'Invalid channel {} for size {}'.format(channel, waveform.shape[0]))
    waveform = waveform[channel, :]  # size (n)

    # frame_shift and frame_length are given in milliseconds.
    ms_to_samples = sr * 0.001
    window_shift = int(sr * frame_shift * 0.001)
    window_size = int(sr * frame_length * 0.001)
    del ms_to_samples  # kept only as a reading aid; the products above match the original order

    if round_to_power_of_two:
        padded_window_size = _next_power_of_2(window_size)
    else:
        padded_window_size = window_size

    assert 2 <= window_size <= len(waveform), (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         len(waveform)))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
        ' use `round_to_power_of_two` or change `frame_length`'
    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
|
||||
|
||||
|
||||
def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    """Frame a waveform and apply dither, DC removal, pre-emphasis and windowing.

    Args:
        waveform (Tensor): 1-D signal.
        padded_window_size (int): Frame length after zero padding on the right.
        window_size (int): Frame length in samples.
        window_shift (int): Hop between frames in samples.
        window_type (str): Name of the analysis window.
        blackman_coeff (float): Coefficient for the Blackman window.
        snip_edges (bool): Forwarded to ``_get_strided``.
        raw_energy (bool): If True the log energy is computed before pre-emphasis
            and windowing, otherwise after windowing and padding.
        energy_floor (float): Floor for the energy (see ``_get_log_energy``).
        dither (float): Scale of the Gaussian noise added to every frame; ``0.0``
            disables dithering.
        remove_dc_offset (bool): If True subtract each frame's mean from the frame.
        preemphasis_coefficient (float): Pre-emphasis coefficient; ``0.0`` disables it.

    Returns:
        Tuple[Tensor, Tensor]: The processed frames and the per-frame log energy.
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    # (m, window_size)
    strided_input = _get_strided(waveform, window_size, window_shift,
                                 snip_edges)

    if dither != 0.0:
        # Draw standard-normal noise directly. The previous Box-Muller expression
        # reused one uniform sample for both the radius and the angle, which does
        # not produce a Gaussian (the two inputs must be independent).
        rand_gauss = paddle.randn(strided_input.shape, dtype=dtype)
        strided_input = strided_input + rand_gauss * dither

    if remove_dc_offset:
        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
        strided_input = strided_input - row_means

    if raw_energy:
        # Energy of the frames before pre-emphasis and windowing.
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
        # Shift every frame right by one sample, repeating its first sample, so that
        # column j of the shifted frames holds sample j - 1.
        offset_strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0),
            data_format='NCL',
            mode='replicate').squeeze(0)  # (m, window_size + 1)
        strided_input = (strided_input - preemphasis_coefficient *
                         offset_strided_input[:, :-1])

    window_function = _feature_window_function(
        window_type, window_size, blackman_coeff,
        dtype).unsqueeze(0)  # (1, window_size)
    strided_input = strided_input * window_function  # (m, window_size)

    # (m, padded_window_size)
    if padded_window_size != window_size:
        padding_right = padded_window_size - window_size
        strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (0, padding_right),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)

    if not raw_energy:
        # Energy of the frames after windowing and zero padding.
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # size (m)

    return strided_input, signal_log_energy
|
||||
|
||||
|
||||
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    """Subtract the mean over axis 0 from ``tensor`` when ``subtract_mean`` is set.

    The input is returned untouched when ``subtract_mean`` is falsy.
    """
    if not subtract_mean:
        return tensor
    column_means = paddle.mean(tensor, axis=0).unsqueeze(0)
    return tensor - column_means
|
||||
|
||||
|
||||
def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str=POVEY) -> Tensor:
    """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before preemphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop samples at the end of the waveform that cannot fill a
            complete frame. Otherwise the waveform is padded with reversed copies of itself.
            Defaults to True.
        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.

    Returns:
        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number
        of frames, which depends on frame_length and frame_shift.
    """
    input_dtype = waveform.dtype
    epsilon = _get_epsilon(input_dtype)

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, frame_log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # One-sided FFT of every frame.
    spectrum = paddle.fft.rfft(frames)

    # Log power spectrum, floored at epsilon: (m, padded_window_size // 2 + 1)
    squared_magnitude = spectrum.abs().pow(2.)
    log_power = paddle.maximum(squared_magnitude, epsilon).log()
    # The first bin of every frame is replaced by the frame's log energy.
    log_power[:, 0] = frame_log_energy

    return _subtract_column_mean(log_power, subtract_mean)
|
||||
|
||||
|
||||
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
    """Convert a single mel value to Hz: ``700 * (exp(mel / 1127) - 1)``."""
    ratio = math.exp(mel_freq / 1127.0)
    return 700.0 * (ratio - 1.0)
|
||||
|
||||
|
||||
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    """Tensor version of the mel-to-Hz conversion: ``700 * (exp(mel / 1127) - 1)``."""
    ratio = (mel_freq / 1127.0).exp()
    return 700.0 * (ratio - 1.0)
|
||||
|
||||
|
||||
def _mel_scale_scalar(freq: float) -> float:
|
||||
return 1127.0 * math.log(1.0 + freq / 700.0)
|
||||
|
||||
|
||||
def _mel_scale(freq: Tensor) -> Tensor:
|
||||
return 1127.0 * (1.0 + freq / 700.0).log()
|
||||
|
||||
|
||||
def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    """Apply the piecewise-linear VTLN warping function to frequencies in Hz.

    The warp has three linear segments joined at two knees:

    * below the lower knee: ``low_freq + scale_left * (f - low_freq)``
    * between the knees: ``f / vtln_warp_factor``
    * at or above the upper knee: ``high_freq + scale_right * (f - high_freq)``

    Frequencies outside ``[low_freq, high_freq]`` are returned unchanged.

    Args:
        vtln_low_cutoff (float): Lower inflection point; must exceed ``low_freq``.
        vtln_high_cutoff (float): Upper inflection point; must be below ``high_freq``.
        low_freq (float): Lower frequency cut-off of the mel bank.
        high_freq (float): Upper frequency cut-off of the mel bank.
        vtln_warp_factor (float): VTLN warp factor.
        freq (Tensor): Frequencies in Hz to warp.

    Returns:
        Tensor: Warped frequencies, same shape as ``freq``.
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'

    # Knee positions on the input axis and their images on the warped axis.
    lower_knee = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    upper_knee = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    scale = 1.0 / vtln_warp_factor
    warped_lower = scale * lower_knee
    warped_upper = scale * upper_knee
    assert lower_knee > low_freq and upper_knee < high_freq

    # Slopes of the outer segments, chosen so the warp is continuous and
    # maps low_freq -> low_freq and high_freq -> high_freq.
    scale_left = (warped_lower - low_freq) / (lower_knee - low_freq)
    scale_right = (high_freq - warped_upper) / (high_freq - upper_knee)

    warped = paddle.empty_like(freq)

    out_of_band = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
    below_lower_knee = paddle.less_than(freq, paddle.to_tensor(lower_knee))
    below_upper_knee = paddle.less_than(freq, paddle.to_tensor(upper_knee))
    from_upper_knee = paddle.greater_equal(freq, paddle.to_tensor(upper_knee))

    # The masks overlap, so the write order matters: later writes override
    # earlier ones (middle segment, then left segment, then pass-through).
    warped[from_upper_knee] = high_freq + scale_right * (
        freq[from_upper_knee] - high_freq)
    warped[below_upper_knee] = scale * freq[below_upper_knee]
    warped[below_lower_knee] = low_freq + scale_left * (
        freq[below_lower_knee] - low_freq)
    warped[out_of_band] = freq[out_of_band]

    return warped
|
||||
|
||||
|
||||
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    """Apply VTLN warping to values given on the mel scale.

    The input is converted to Hz, warped with ``_vtln_warp_freq`` and
    converted back to mel.

    Args:
        vtln_low_cutoff (float): Lower inflection point of the warp.
        vtln_high_cutoff (float): Upper inflection point of the warp.
        low_freq: Lower frequency cut-off of the mel bank.
        high_freq (float): Upper frequency cut-off of the mel bank.
        vtln_warp_factor (float): VTLN warp factor.
        mel_freq (Tensor): Values on the mel scale.

    Returns:
        Tensor: Warped values on the mel scale, same shape as ``mel_freq``.
    """
    freq_hz = _inverse_mel_scale(mel_freq)
    warped_hz = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
                                high_freq, vtln_warp_factor, freq_hz)
    return _mel_scale(warped_hz)
|
||||
|
||||
|
||||
def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    """Build the triangular mel filter bank.

    Args:
        num_bins (int): Number of mel filters; at least 3.
        window_length_padded (int): FFT size; must be even.
        sample_freq (float): Sample rate of the signal in Hz.
        low_freq (float): Lower cut-off frequency in Hz.
        high_freq (float): Upper cut-off frequency in Hz. Values ``<= 0`` are
            interpreted as an offset from the Nyquist frequency.
        vtln_low (float): Lower inflection point of the VTLN warp.
        vtln_high (float): Upper inflection point of the VTLN warp. Negative
            values are interpreted as an offset from the Nyquist frequency.
        vtln_warp_factor (float): VTLN warp factor; ``1.0`` disables warping.

    Returns:
        Tuple[Tensor, Tensor]: The filter weights with shape
        `(num_bins, window_length_padded / 2)` and the filter center
        frequencies in Hz with shape `(num_bins, 1)`.
    """
    # The condition used to be `num_bins > 3`, which rejected exactly three
    # bins even though the message (and Kaldi) allow them.
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # Kept as true division: paddle.arange() below then yields a float tensor.
    num_fft_bins = window_length_padded / 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))

    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)

    # num_bins triangles need num_bins + 2 equally spaced points on the mel
    # axis, i.e. num_bins + 1 intervals.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))

    bin_index = paddle.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin_index * mel_freq_delta  # (num_bins, 1)
    center_mel = mel_low_freq + (bin_index + 1.0) * mel_freq_delta  # (num_bins, 1)
    right_mel = mel_low_freq + (bin_index + 2.0) * mel_freq_delta  # (num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
                                       vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)

    center_freqs = _inverse_mel_scale(center_mel)  # (num_bins, 1)
    # Mel value of every FFT bin: (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)

    # Rising and falling edge of every triangle: (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # Without warping the triangle is min(up, down) clipped at zero.
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        # With warping, fill the two edges explicitly inside their ranges.
        bins = paddle.zeros_like(up_slope)
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
            mel, center_mel)
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
|
||||
|
||||
|
||||
def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str=POVEY) -> Tensor:
    """Compute mel filter bank features from a waveform, following Kaldi.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of the waveform to use. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the energy of the output spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): Upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True, the energy column is placed last instead of first. Defaults to False.
        low_freq (float, optional): Lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient applied to the waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before pre-emphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad each window to a power-of-two length
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop trailing samples that cannot fill a complete frame;
            otherwise reflect-pad the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the features. Defaults to False.
        use_energy (bool, optional): Add an extra column holding the log energy. Defaults to False.
        use_log_fbank (bool, optional): Return log filter bank energies. Defaults to True.
        use_power (bool, optional): Use the power spectrum instead of the magnitude spectrum. Defaults to True.
        vtln_high (float, optional): High inflection point of the piecewise-linear VTLN warp. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise-linear VTLN warp. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Type of window used before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A filter bank tensor with shape `(m, n_mels)`, or
        `(m, n_mels + 1)` when `use_energy` is True, where m is the number of frames.
    """
    dtype = waveform.dtype

    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
        preemphasis_coefficient)

    frames, log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # Magnitude (or power) spectrum: (m, padded_window_size // 2 + 1)
    magnitude = paddle.fft.rfft(frames).abs()
    spectrum = magnitude.pow(2.) if use_power else magnitude

    # Mel filter weights: (n_mels, padded_window_size // 2)
    filter_bank = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                 high_freq, vtln_low, vtln_high, vtln_warp)[0]
    filter_bank = filter_bank.astype(dtype)

    # Append one zero column so the weights line up with the rfft output:
    # (n_mels, padded_window_size // 2 + 1)
    filter_bank = paddle.nn.functional.pad(
        filter_bank.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)

    # Project the spectrum onto the filters: (m, n_mels)
    features = paddle.mm(spectrum, filter_bank.T)
    if use_log_fbank:
        # Floor at epsilon before taking the log.
        features = paddle.maximum(features, _get_epsilon(dtype)).log()

    if use_energy:
        energy_column = log_energy.unsqueeze(1)  # (m, 1)
        if htk_compat:
            pieces = (features, energy_column)
        else:
            pieces = (energy_column, features)
        # (m, n_mels + 1)
        features = paddle.concat(pieces, axis=1)

    return _subtract_column_mean(features, subtract_mean)
|
||||
|
||||
|
||||
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    """Return the DCT matrix that maps mel energies to cepstral coefficients.

    A square `(n_mels, n_mels)` matrix is created with ``create_dct`` using
    'ortho' normalization, its first column is overwritten with
    ``sqrt(1 / n_mels)``, and the first `n_mfcc` columns are kept.

    Args:
        n_mfcc (int): Number of cepstral coefficients to keep.
        n_mels (int): Number of mel bins.

    Returns:
        Tensor: DCT matrix with shape `(n_mels, n_mfcc)`.
    """
    first_column_value = math.sqrt(1 / float(n_mels))
    square_dct = create_dct(n_mels, n_mels, 'ortho')
    square_dct[:, 0] = first_column_value
    return square_dct[:, :n_mfcc]  # (n_mels, n_mfcc)
|
||||
|
||||
|
||||
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    """Return the cepstral liftering weights.

    The weight for coefficient ``i`` is
    ``1 + 0.5 * cepstral_lifter * sin(pi * i / cepstral_lifter)``.

    Args:
        n_mfcc (int): Number of cepstral coefficients.
        cepstral_lifter (float): Liftering constant; must be non-zero.

    Returns:
        Tensor: Liftering weights with shape `(n_mfcc,)`.
    """
    index = paddle.arange(n_mfcc)
    phase = math.pi * index / cepstral_lifter
    return 1.0 + 0.5 * cepstral_lifter * paddle.sin(phase)
|
||||
|
||||
|
||||
def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str=POVEY) -> Tensor:
    """Compute mel frequency cepstral coefficients from a waveform, following Kaldi.

    Log-power filter bank features are computed with ``fbank``, projected
    through a DCT, optionally liftered, and optionally have their first
    coefficient replaced by the log energy.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        cepstral_lifter (float, optional): Liftering constant that scales the output MFCCs;
            0 disables liftering. Defaults to 22.0.
        channel (int, optional): Channel of the waveform to use. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the energy of the output spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): Upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True, the energy / first coefficient is placed last. Defaults to False.
        low_freq (float, optional): Lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra; must not exceed `n_mels`. Defaults to 13.
        n_mels (int, optional): Number of mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient applied to the waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before pre-emphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad each window to a power-of-two length
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop trailing samples that cannot fill a complete frame;
            otherwise reflect-pad the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the features. Defaults to False.
        use_energy (bool, optional): Use the log energy in place of the first coefficient. Defaults to False.
        vtln_high (float, optional): High inflection point of the piecewise-linear VTLN warp. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise-linear VTLN warp. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Type of window used before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)

    dtype = waveform.dtype

    # Log-power mel energies: (m, n_mels + use_energy). Mean subtraction is
    # deferred to the end of this function.
    feature = fbank(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        n_mels=n_mels,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sr=sr,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type)

    if use_energy:
        # fbank puts the energy column last for htk_compat, first otherwise;
        # split it off so only the mel columns go through the DCT.
        energy_col = n_mels if htk_compat else 0
        first_mel_col = int(not htk_compat)
        signal_log_energy = feature[:, energy_col]  # (m)
        feature = feature[:, first_mel_col:(n_mels + first_mel_col)]

    # (n_mels, n_mfcc)
    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)

    # (m, n_mfcc)
    feature = feature.matmul(dct_matrix)

    if cepstral_lifter != 0.0:
        # (1, n_mfcc)
        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter_coeffs.astype(dtype=dtype)

    if use_energy:
        # The log energy replaces the first cepstral coefficient.
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # Move the first column to the end.
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            # The first coefficient is scaled by sqrt(2) when it is not the energy.
            energy *= math.sqrt(2)

        feature = paddle.concat((feature, energy), axis=1)

    return _subtract_column_mean(feature, subtract_mean)
|
@ -1,788 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from librosa(https://github.com/librosa/librosa)
|
||||
import warnings
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import scipy
|
||||
from numpy.lib.stride_tricks import as_strided
|
||||
from scipy import signal
|
||||
|
||||
from ..utils import depth_convert
|
||||
from ..utils import ParameterError
|
||||
|
||||
__all__ = [
|
||||
# dsp
|
||||
'stft',
|
||||
'mfcc',
|
||||
'hz_to_mel',
|
||||
'mel_to_hz',
|
||||
'mel_frequencies',
|
||||
'power_to_db',
|
||||
'compute_fbank_matrix',
|
||||
'melspectrogram',
|
||||
'spectrogram',
|
||||
'mu_encode',
|
||||
'mu_decode',
|
||||
# augmentation
|
||||
'depth_augment',
|
||||
'spect_augment',
|
||||
'random_crop1d',
|
||||
'random_crop2d',
|
||||
'adaptive_spect_augment',
|
||||
]
|
||||
|
||||
|
||||
def _pad_center(data: np.ndarray, size: int, axis: int=-1,
|
||||
**kwargs) -> np.ndarray:
|
||||
"""Pad an array to a target length along a target axis.
|
||||
|
||||
This differs from `np.pad` by centering the data prior to padding,
|
||||
analogous to `str.center`
|
||||
"""
|
||||
|
||||
kwargs.setdefault("mode", "constant")
|
||||
n = data.shape[axis]
|
||||
lpad = int((size - n) // 2)
|
||||
lengths = [(0, 0)] * data.ndim
|
||||
lengths[axis] = (lpad, int(size - n - lpad))
|
||||
|
||||
if lpad < 0:
|
||||
raise ParameterError(("Target size ({size:d}) must be "
|
||||
"at least input size ({n:d})"))
|
||||
|
||||
return np.pad(data, lengths, **kwargs)
|
||||
|
||||
|
||||
def _split_frames(x: np.ndarray,
|
||||
frame_length: int,
|
||||
hop_length: int,
|
||||
axis: int=-1) -> np.ndarray:
|
||||
"""Slice a data array into (overlapping) frames.
|
||||
|
||||
This function is aligned with librosa.frame
|
||||
"""
|
||||
|
||||
if not isinstance(x, np.ndarray):
|
||||
raise ParameterError(
|
||||
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
|
||||
|
||||
if x.shape[axis] < frame_length:
|
||||
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
|
||||
f" for frame_length={frame_length:d}")
|
||||
|
||||
if hop_length < 1:
|
||||
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
|
||||
|
||||
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
|
||||
warnings.warn(f"librosa.util.frame called with axis={axis} "
|
||||
"on a non-contiguous input. This will result in a copy.")
|
||||
x = np.asfortranarray(x)
|
||||
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
|
||||
warnings.warn(f"librosa.util.frame called with axis={axis} "
|
||||
"on a non-contiguous input. This will result in a copy.")
|
||||
x = np.ascontiguousarray(x)
|
||||
|
||||
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
|
||||
strides = np.asarray(x.strides)
|
||||
|
||||
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
|
||||
|
||||
if axis == -1:
|
||||
shape = list(x.shape)[:-1] + [frame_length, n_frames]
|
||||
strides = list(strides) + [hop_length * new_stride]
|
||||
|
||||
elif axis == 0:
|
||||
shape = [n_frames, frame_length] + list(x.shape)[1:]
|
||||
strides = [hop_length * new_stride] + list(strides)
|
||||
|
||||
else:
|
||||
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
|
||||
|
||||
return as_strided(x, shape=shape, strides=strides)
|
||||
|
||||
|
||||
def _check_audio(y, mono=True) -> bool:
|
||||
"""Determine whether a variable contains valid audio data.
|
||||
|
||||
The audio y must be a np.ndarray, ether 1-channel or two channel
|
||||
"""
|
||||
if not isinstance(y, np.ndarray):
|
||||
raise ParameterError("Audio data must be of type numpy.ndarray")
|
||||
if y.ndim > 2:
|
||||
raise ParameterError(
|
||||
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
|
||||
|
||||
if mono and y.ndim == 2:
|
||||
raise ParameterError(
|
||||
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
|
||||
|
||||
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
|
||||
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
|
||||
|
||||
if not np.issubdtype(y.dtype, np.floating):
|
||||
raise ParameterError("Audio data must be floating-point")
|
||||
|
||||
if not np.isfinite(y).all():
|
||||
raise ParameterError("Audio buffer is not finite everywhere")
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert Hz to Mels.

    With `htk=True` the formula ``2595 * log10(1 + f / 700)`` is used.
    Otherwise the scale is linear below 1000 Hz and logarithmic from
    1000 Hz upwards.

    Args:
        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequency in mels.
    """
    hz = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    # Start from the linear mapping, then overwrite the log region.
    mels = (hz - f_min) / f_sp

    if hz.ndim == 0:
        # Scalar input: check directly.
        if hz >= min_log_hz:
            mels = min_log_mel + np.log(hz / min_log_hz) / logstep
        return mels

    # Array input: vectorize with a boolean mask.
    in_log_region = hz >= min_log_hz
    mels[in_log_region] = min_log_mel + \
        np.log(hz[in_log_region] / min_log_hz) / logstep
    return mels
|
||||
|
||||
|
||||
def mel_to_hz(mels: Union[float, List[float], np.ndarray],
              htk: int=False) -> np.ndarray:
    """Convert mel bin numbers to frequencies.

    With `htk=True` the formula ``700 * (10 ** (m / 2595) - 1)`` is used.
    Otherwise the scale is linear below 1000 Hz and logarithmic from
    1000 Hz upwards.

    Args:
        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequencies in Hz.
    """
    mel_values = np.asanyarray(mels)

    if htk:
        return 700.0 * (10.0**(mel_values / 2595.0) - 1.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    # Start from the linear mapping, then overwrite the log region.
    freqs = f_min + f_sp * mel_values

    if mel_values.ndim == 0:
        # Scalar input: check directly.
        if mel_values >= min_log_mel:
            freqs = min_log_hz * np.exp(logstep * (mel_values - min_log_mel))
        return freqs

    # Array input: vectorize with a boolean mask.
    in_log_region = mel_values >= min_log_mel
    freqs[in_log_region] = min_log_hz * \
        np.exp(logstep * (mel_values[in_log_region] - min_log_mel))
    return freqs
|
||||
|
||||
|
||||
def mel_frequencies(n_mels: int=128,
                    fmin: float=0.0,
                    fmax: float=11025.0,
                    htk: bool=False) -> np.ndarray:
    """Compute mel frequencies.

    The band edges `fmin` and `fmax` are converted to mels, `n_mels` points
    are spaced uniformly between them on the mel scale, and the points are
    converted back to Hz.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    lowest_mel = hz_to_mel(fmin, htk=htk)
    highest_mel = hz_to_mel(fmax, htk=htk)
    mel_points = np.linspace(lowest_mel, highest_mel, n_mels)
    return mel_to_hz(mel_points, htk=htk)
|
||||
|
||||
|
||||
def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
    """Compute fourier frequencies.

    The bins are spaced uniformly from 0 Hz up to and including `sr / 2`.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.

    Returns:
        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, nyquist, n_bins, endpoint=True)
|
||||
|
||||
|
||||
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=128,
                         fmin: float=0.0,
                         fmax: Optional[float]=None,
                         htk: bool=False,
                         norm: str="slaney",
                         dtype: type=np.float32) -> np.ndarray:
    """Compute fbank matrix.

    Each row is a triangular filter over the FFT bins; the triangles are
    scaled by ``2 / bandwidth`` (Slaney-style normalization).

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (Optional[float], optional): Maximum frequency in Hz; `sr / 2` when None. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (str, optional): Type of normalization; only "slaney" is accepted. Defaults to "slaney".
        dtype (type, optional): Data type. Defaults to np.float32.

    Returns:
        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.

    Raises:
        ParameterError: If `norm` is not "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    if fmax is None:
        fmax = float(sr) / 2

    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Center frequency of every FFT bin.
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # Band edges: n_mels triangles need n_mels + 2 points.
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    # Rising and falling slopes of all filters at once, intersected with
    # each other and with zero.
    lower = -ramps[:n_mels] / fdiff[:n_mels, np.newaxis]
    upper = ramps[2:n_mels + 2] / fdiff[1:n_mels + 1, np.newaxis]
    weights[:] = np.maximum(0, np.minimum(lower, upper))

    # Slaney-style mel is scaled to be approx constant energy per channel.
    # `norm` is guaranteed to be "slaney" at this point.
    enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
    weights *= enorm[:, np.newaxis]

    # A filter whose lower edge is non-zero must have some non-zero weight.
    filters_ok = (mel_f[:-2] == 0) | (weights.max(axis=1) > 0)
    if not np.all(filters_ok):
        # This means we have an empty channel somewhere
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return weights
|
||||
|
||||
|
||||
def stft(x: np.ndarray,
         n_fft: int=2048,
         hop_length: Optional[int]=None,
         win_length: Optional[int]=None,
         window: str="hann",
         center: bool=True,
         dtype: type=np.complex64,
         pad_mode: str="reflect") -> np.ndarray:
    """Short-time Fourier transform (STFT).

    Args:
        x (np.ndarray): Input waveform in one dimension.
        n_fft (int, optional): FFT size. Defaults to 2048.
        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows.
            `win_length // 4` when None. Defaults to None.
        win_length (Optional[int], optional): The size of window. `n_fft` when None. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that sample `t * hop_length` is at the center
            of the `t`-th frame. Defaults to True.
        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".

    Returns:
        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.

    Raises:
        ParameterError: If `x` is not valid audio, or if `center` is False
            and `n_fft` exceeds the signal length.
    """
    _check_audio(x)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = signal.get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = _pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        if n_fft > x.shape[-1]:
            # The condition is n_fft > length, so n_fft is too *large*
            # (the message used to say "too small").
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
            )
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)

    elif n_fft > x.shape[-1]:
        raise ParameterError(
            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
    fft = np.fft  # use numpy fft as default
    # Constrain STFT block sizes to 256 KB
    MAX_MEM_BLOCK = 2**8 * 2**10
    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Transform the frames block by block to bound the temporary memory.
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.rfft(
            fft_window * x_frames[:, bl_s:bl_t], axis=0)

    return stft_matrix
|
||||
|
||||
|
||||
def power_to_db(spect: np.ndarray,
                ref: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=80.0) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The scaling `10 * log10(x / ref)` is computed in a numerically stable
    way by flooring both `x` and `ref` at `amin`.

    Args:
        spect (np.ndarray): STFT power spectrogram of an input waveform.
        ref (float, optional): The reference value, or a callable that computes it from the
            spectrogram. If smaller than 1.0, the db level of the signal will be pulled up
            accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak;
            no thresholding when None. Defaults to 80.0.

    Returns:
        np.ndarray: Power spectrogram in db scale.

    Raises:
        ParameterError: If `amin` is not strictly positive or `top_db` is negative.
    """
    spect = np.asarray(spect)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    magnitude = spect
    if np.issubdtype(spect.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.")
        magnitude = np.abs(spect)

    # A callable `ref` computes the reference power from the data itself.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    # In-place on purpose: keeps the dtype of `log_spec`.
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise ParameterError("top_db must be non-negative")
    return np.maximum(log_spec, log_spec.max() - top_db)
|
||||
|
||||
|
||||
def mfcc(x: np.ndarray,
         sr: int=16000,
         spect: Optional[np.ndarray]=None,
         n_mfcc: int=20,
         dct_type: int=2,
         norm: str="ortho",
         lifter: int=0,
         **kwargs) -> np.ndarray:
    """Compute Mel-frequency cepstral coefficients (MFCCs).

    Args:
        x (np.ndarray): Input waveform in one dimension. Only used when
            `spect` is None.
        sr (int, optional): Sample rate. Defaults to 16000.
        spect (Optional[np.ndarray], optional): Precomputed log-power Mel
            spectrogram. When None, it is computed with
            `melspectrogram(x, sr=sr, **kwargs)`. Defaults to None.
        n_mfcc (int, optional): Number of cepstra to keep. Defaults to 20.
        dct_type (int, optional): Discrete cosine transform (DCT) type.
            Defaults to 2.
        norm (str, optional): DCT normalization mode. Defaults to "ortho".
        lifter (int, optional): Cepstral liftering parameter; 0 disables
            liftering. Defaults to 0.
        **kwargs: Extra keyword arguments forwarded to `melspectrogram`.

    Returns:
        np.ndarray: MFCC array with shape `(n_mfcc, num_frames)`.

    Raises:
        ParameterError: If `lifter` is negative.
    """
    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    # DCT along the Mel axis; keep only the first `n_mfcc` coefficients.
    cepstra = scipy.fftpack.dct(
        spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]

    if lifter == 0:
        return cepstra
    if not lifter > 0:
        raise ParameterError(
            f"MFCC lifter={lifter} must be a non-negative number")

    # NOTE(review): common liftering (e.g. HTK) scales by
    # 1 + (lifter / 2) * sin(pi * n / lifter); here the coefficients are
    # multiplied by the sine term alone -- confirm this is intended.
    order = np.arange(1, 1 + n_mfcc, dtype=cepstra.dtype)
    factor = np.sin(np.pi * order / lifter)
    return cepstra * factor.reshape(-1, 1)
|
||||
|
||||
|
||||
def melspectrogram(x: np.ndarray,
                   sr: int=16000,
                   window_size: int=512,
                   hop_length: int=320,
                   n_mels: int=64,
                   fmin: float=50.0,
                   fmax: Optional[float]=None,
                   window: str='hann',
                   center: bool=True,
                   pad_mode: str='reflect',
                   power: float=2.0,
                   to_db: bool=True,
                   ref: float=1.0,
                   amin: float=1e-10,
                   top_db: Optional[float]=None) -> np.ndarray:
    """Compute mel-spectrogram.

    Args:
        x (np.ndarray): Input waveform in one dimension.
        sr (int, optional): Sample rate. Defaults to 16000.
        window_size (int, optional): Size of FFT and window length. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        fmin (float, optional): Minimum frequency in Hz. Must satisfy `0 <= fmin < fmax`. Defaults to 50.0.
        fmax (Optional[float], optional): Maximum frequency in Hz. When None, `sr // 2` is used. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent applied to the STFT magnitude. Defaults to 2.0.
        to_db (bool, optional): Enable db scale. Defaults to True.
        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.

    Returns:
        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.

    Raises:
        ParameterError: If the waveform is empty or the frequency range is invalid.
    """
    _check_audio(x, mono=True)
    if len(x) <= 0:
        raise ParameterError('The input waveform is empty')

    if fmax is None:
        fmax = sr // 2
    if fmin < 0 or fmin >= fmax:
        # The message mirrors the check above: fmin == 0 is accepted.
        raise ParameterError('fmin and fmax must satisfy 0<=fmin<fmax')

    s = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)

    spect_power = np.abs(s)**power
    # Project the linear-frequency bins onto the mel filter bank.
    fb_matrix = compute_fbank_matrix(
        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
    mel_spect = np.matmul(fb_matrix, spect_power)

    if to_db:
        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
    return mel_spect
|
||||
|
||||
|
||||
def spectrogram(x: np.ndarray,
                sr: int=16000,
                window_size: int=512,
                hop_length: int=320,
                window: str='hann',
                center: bool=True,
                pad_mode: str='reflect',
                power: float=2.0) -> np.ndarray:
    """Compute the STFT spectrogram of a waveform.

    Args:
        x (np.ndarray): Input waveform in one dimension.
        sr (int, optional): Sample rate. Not used by the computation itself.
            Defaults to 16000.
        window_size (int, optional): Size of FFT and window length; passed to
            `stft` as both `n_fft` and `win_length`. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between
            adjacent windows. Defaults to 320.
        window (str, optional): A string of window specification. Defaults to
            "hann".
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is
            centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Padding pattern used when `center` is True.
            Defaults to "reflect".
        power (float, optional): Exponent applied to the STFT magnitude.
            Defaults to 2.0.

    Returns:
        np.ndarray: The spectrogram `|STFT(x)| ** power` with shape
            `(n_fft//2 + 1, num_frames)`.
    """
    stft_options = dict(
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)
    magnitude = np.abs(stft(x, **stft_options))
    return magnitude**power
|
||||
|
||||
|
||||
def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law encoding. Encode a waveform with mu-law companding.

    The companded signal is `sign(x) * log(1 + mu * |x|) / log(1 + mu)`. When
    `quantized` is True the result is mapped to the integer codes `[0, mu]`
    (returned as floats); otherwise it stays in `[-1, 1]`.

    Args:
        x (np.ndarray): The input waveform to encode, expected in `[-1, 1]`.
        mu (int, optional): The companding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.

    Returns:
        np.ndarray: The mu-law encoded waveform.
    """
    # The `mu` argument is used as given (it used to be overwritten with 255).
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    if quantized:
        # Map [-1, 1] onto the codes 0..mu, rounding half up.
        y = np.floor((y + 1) / 2 * mu + 0.5)
    return y
|
||||
|
||||
|
||||
def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law decoding. Invert mu-law companding for an encoded signal.

    The input `y` is assumed to hold integer codes in `[0, mu]` when
    `quantized` is True, and companded values in `[-1, 1]` otherwise. The
    expansion is `sign(y) / mu * ((1 + mu) ** |y| - 1)`, the inverse of
    `sign(x) * log(1 + mu * |x|) / log(1 + mu)`.

    Args:
        y (np.ndarray): The encoded waveform.
        mu (int, optional): The companding parameter; must match the value used for encoding. Defaults to 255.
        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.

    Returns:
        np.ndarray: The mu-law decoded waveform.

    Raises:
        ParameterError: If `mu` is smaller than 1.
    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    # `mu` is used as given. Decoding with `mu - 1` mapped the top code to a
    # value above 1.0, so it was not the inverse of the companding formula.
    if quantized:  # undo the quantization: codes 0..mu -> [-1, 1]
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
|
||||
|
||||
|
||||
def _randint(high: int) -> int:
|
||||
"""Generate one random integer in range [0 high)
|
||||
|
||||
This is a helper function for random data augmentaiton
|
||||
"""
|
||||
return int(np.random.randint(0, high=high))
|
||||
|
||||
|
||||
def depth_augment(y: np.ndarray,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
    """Audio depth augmentation.

    Simulates quantization distortion: the waveform is converted to a randomly
    chosen depth with `depth_convert` and then converted back to its original
    dtype.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        choices (List, optional): Candidate data types for the intermediate
            depth. Defaults to ['int8', 'int16'].
        probs (List[float], optional): Selection probability of each entry in
            `choices`. Defaults to [0.5, 0.5].

    Returns:
        np.ndarray: The augmented waveform, in the dtype of `y`.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))

    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype

    # Round trip: original depth -> sampled depth -> original depth.
    degraded = depth_convert(y, target_depth)
    return depth_convert(degraded, original_depth)
|
||||
|
||||
|
||||
def adaptive_spect_augment(spect: np.ndarray,
                           tempo_axis: int=0,
                           level: float=0.1) -> np.ndarray:
    """Do adaptive spectrogram augmentation.

    Zeroes random bands along the time and frequency axes. Both the number of
    bands (`int(10 * level)` per axis) and their width (`int(size * level *
    0.5)`) are governed by `level`, which ranges from 0 to 1, with 0 meaning
    no augmentation. The input array is modified in place and also returned.

    Args:
        spect (np.ndarray): Input spectrogram (2D).
        tempo_axis (int, optional): Indicate the tempo axis; 0 means the first
            axis is time, any other value means the second axis is time.
            Defaults to 0.
        level (float, optional): The level factor of masking. Defaults to 0.1.

    Returns:
        np.ndarray: The augmented spectrogram (same object as `spect`).
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    time_axis = 0 if tempo_axis == 0 else 1
    freq_axis = 1 - time_axis
    nt = spect.shape[time_axis]
    nf = spect.shape[freq_axis]

    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)
    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    def _zero_band(axis, size, width):
        # Zero `width` consecutive entries along `axis` at a random offset.
        start = _randint(size - width)
        index = [slice(None), slice(None)]
        index[axis] = slice(start, start + width)
        spect[tuple(index)] = 0

    # Time masks are drawn before frequency masks.
    for _ in range(num_time_mask):
        _zero_band(time_axis, nt, time_mask_width)
    for _ in range(num_freq_mask):
        _zero_band(freq_axis, nf, freq_mask_width)

    return spect
|
||||
|
||||
|
||||
def _mask_random_bands(spect, axis, num_masks, width):
    """Zero `num_masks` random bands of `width` entries along `axis`, in place."""
    size = spect.shape[axis]
    if size < 1:
        return
    # Clamp so that `size - width >= 1`; otherwise drawing the start offset
    # fails whenever the sampled width reaches the axis length.
    width = min(width, size - 1)
    for _ in range(num_masks):
        start = _randint(size - width)
        index = [slice(None), slice(None)]
        index[axis] = slice(start, start + width)
        spect[tuple(index)] = 0


def spect_augment(spect: np.ndarray,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> np.ndarray:
    """Do spectrogram augmentation in both time and freq axis.

    The number of masks and the mask width are each drawn once per axis, then
    that many bands are zeroed at random offsets. The input array is modified
    in place and also returned. A mask width that does not fit is clamped to
    the axis length minus one, and a `max_*` argument of 0 disables the
    corresponding masking instead of raising.

    Args:
        spect (np.ndarray): Input spectrogram (2D).
        tempo_axis (int, optional): Indicate the tempo axis; 0 means the first axis is time, any other value means the second axis is time. Defaults to 0.
        max_time_mask (int, optional): Exclusive upper bound on the number of time masks. Defaults to 3.
        max_freq_mask (int, optional): Exclusive upper bound on the number of frequency masks. Defaults to 3.
        max_time_mask_width (int, optional): Exclusive upper bound on the width of time masks. Defaults to 30.
        max_freq_mask_width (int, optional): Exclusive upper bound on the width of frequency masks. Defaults to 20.

    Returns:
        np.ndarray: The augmented spectrogram (same object as `spect`).
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    time_axis = 0 if tempo_axis == 0 else 1
    freq_axis = 1 - time_axis

    # The draw order (counts first, then widths) matches the previous code, so
    # results are unchanged for inputs that used to work.
    num_time_mask = _randint(max_time_mask) if max_time_mask > 0 else 0
    num_freq_mask = _randint(max_freq_mask) if max_freq_mask > 0 else 0
    time_mask_width = _randint(
        max_time_mask_width) if max_time_mask_width > 0 else 0
    freq_mask_width = _randint(
        max_freq_mask_width) if max_freq_mask_width > 0 else 0

    _mask_random_bands(spect, time_axis, num_time_mask, time_mask_width)
    _mask_random_bands(spect, freq_axis, num_freq_mask, freq_mask_width)

    return spect
|
||||
|
||||
|
||||
def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
    """Random cropping on an input waveform.

    Args:
        y (np.ndarray): Input waveform array in 1D.
        crop_len (int): Length of waveform to crop; at most `len(y)`.

    Returns:
        np.ndarray: The cropped waveform, `crop_len` samples starting at a random offset.

    Raises:
        ParameterError: If `y` is not one-dimensional.
        ValueError: Raised by NumPy when `crop_len` exceeds `len(y)`.
    """
    if y.ndim != 1:
        # Previously a bare string expression, so the check had no effect.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    # Valid start offsets are 0..n - crop_len inclusive, hence the +1 on the
    # exclusive bound. This also makes `crop_len == n` return the whole input.
    idx = _randint(n - crop_len + 1)
    return y[idx:idx + crop_len]
|
||||
|
||||
|
||||
def random_crop2d(s: np.ndarray, crop_len: int,
                  tempo_axis: int=0) -> np.ndarray:
    """Random cropping on a spectrogram.

    Args:
        s (np.ndarray): Input spectrogram in 2D.
        crop_len (int): Length of spectrogram to crop; at most `s.shape[tempo_axis]`.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.

    Returns:
        np.ndarray: The cropped spectrogram, `crop_len` entries long along `tempo_axis`.

    Raises:
        ParameterError: If `tempo_axis` is not smaller than `s.ndim`.
        ValueError: Raised by NumPy when `crop_len` exceeds the axis length.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # Valid start offsets are 0..n - crop_len inclusive, hence the +1 on the
    # exclusive bound. This also makes `crop_len == n` return the whole input.
    idx = _randint(high=n - crop_len + 1)
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]
|
@ -1,20 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .esc50 import ESC50
|
||||
from .gtzan import GTZAN
|
||||
from .hey_snips import HeySnips
|
||||
from .rirs_noises import OpenRIRNoise
|
||||
from .tess import TESS
|
||||
from .urban_sound import UrbanSound8K
|
||||
from .voxceleb import VoxCeleb
|
@ -1,100 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
from ..compliance.kaldi import fbank as kaldi_fbank
|
||||
from ..compliance.kaldi import mfcc as kaldi_mfcc
|
||||
from ..compliance.librosa import melspectrogram
|
||||
from ..compliance.librosa import mfcc
|
||||
|
||||
# Maps a `feat_type` name to its feature extraction function. 'raw' maps to
# None, meaning the waveform is used without feature extraction.
feat_funcs = dict(
    raw=None,
    melspectrogram=melspectrogram,
    mfcc=mfcc,
    kaldi_fbank=kaldi_fbank,
    kaldi_mfcc=kaldi_mfcc, )
|
||||
|
||||
|
||||
class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Holds parallel lists of audio file paths and labels, and turns each item
    into a feature (or the raw waveform) plus its label on access.
    """

    def __init__(self,
                 files: List[str],
                 labels: List[int],
                 feat_type: str='raw',
                 sample_rate: int=None,
                 **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
                Must be one of the keys of `feat_funcs`.
            sample_rate (:obj:`int`, `optional`, defaults to `None`):
                Passed as `sr` to the audio loader when not None.
            kwargs: Extra keyword arguments forwarded to the feature function.

        Raises:
            RuntimeError: If `feat_type` is not a key of `feat_funcs`.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in feat_funcs.keys():
            raise RuntimeError(
                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
            )

        self.files = files
        self.labels = labels

        self.feat_type = feat_type
        self.sample_rate = sample_rate
        self.feat_config = kwargs  # Pass keyword arguments to customize feature config

    def _get_data(self, input_file: str):
        """Collect files and labels; must be implemented by subclasses."""
        raise NotImplementedError

    def _convert_to_record(self, idx):
        """Load the `idx`-th audio file and build a dict with 'feat' and 'label'."""
        file, label = self.files[idx], self.labels[idx]

        # NOTE(review): the name `paddlespeech` is not bound by this module's
        # visible imports -- confirm where it is expected to come from.
        if self.sample_rate is None:
            waveform, sample_rate = paddlespeech.audio.load(file)
        else:
            waveform, sample_rate = paddlespeech.audio.load(
                file, sr=self.sample_rate)

        feat_func = feat_funcs[self.feat_type]

        record = {}
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
            # Use the rate reported by the loader: `self.sample_rate` may be
            # None, which would override the feature function's own default.
            record['feat'] = feat_func(
                waveform=waveform, sr=sample_rate, **self.feat_config)
        else:
            record['feat'] = feat_func(
                waveform, sample_rate,
                **self.feat_config) if feat_func else waveform
        record['label'] = label
        return record

    def __getitem__(self, idx):
        """Return the `idx`-th sample.

        For kaldi feature types the result is `(key, feat, label)`; otherwise
        it is `(feat transposed as np.ndarray, label as int64 np.ndarray)`.
        """
        record = self._convert_to_record(idx)
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            # `keys` is only set by some subclasses; fall back to the file
            # path so other datasets do not fail with AttributeError.
            keys = getattr(self, 'keys', None)
            key = keys[idx] if keys is not None else self.files[idx]
            return key, record['feat'], record['label']
        return np.array(record['feat']).transpose(), np.array(
            record['label'], dtype=np.int64)

    def __len__(self):
        """Return the number of audio files."""
        return len(self.files)
|
@ -1,152 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import os
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
from ..utils import DATA_HOME
|
||||
from ..utils.download import download_and_decompress
|
||||
from .dataset import AudioClassificationDataset
|
||||
|
||||
__all__ = ['ESC50']
|
||||
|
||||
|
||||
class ESC50(AudioClassificationDataset):
    """
    ESC-50: a labeled collection of 2000 environmental audio recordings for
    benchmarking environmental sound classification. It consists of
    5-second-long recordings organized into 50 semantic classes (40 examples
    per class).

    Reference:
        ESC: Dataset for Environmental Sound Classification
        http://dx.doi.org/10.1145/2733373.2806390
    """

    # Archive(s) fetched into DATA_HOME when the data is missing.
    archieves = [
        {
            'url':
            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
            'md5': '7771e4b9d86d0945acce719c7a59305a',
        },
    ]
    label_list = [
        # Animals
        'Dog',
        'Rooster',
        'Pig',
        'Cow',
        'Frog',
        'Cat',
        'Hen',
        'Insects (flying)',
        'Sheep',
        'Crow',
        # Natural soundscapes & water sounds
        'Rain',
        'Sea waves',
        'Crackling fire',
        'Crickets',
        'Chirping birds',
        'Water drops',
        'Wind',
        'Pouring water',
        'Toilet flush',
        'Thunderstorm',
        # Human, non-speech sounds
        'Crying baby',
        'Sneezing',
        'Clapping',
        'Breathing',
        'Coughing',
        'Footsteps',
        'Laughing',
        'Brushing teeth',
        'Snoring',
        'Drinking, sipping',
        # Interior/domestic sounds
        'Door knock',
        'Mouse click',
        'Keyboard typing',
        'Door, wood creaks',
        'Can opening',
        'Washing machine',
        'Vacuum cleaner',
        'Clock alarm',
        'Clock tick',
        'Glass breaking',
        # Exterior/urban noises
        'Helicopter',
        'Chainsaw',
        'Siren',
        'Car horn',
        'Engine',
        'Train',
        'Church bells',
        'Airplane',
        'Fireworks',
        'Hand saw',
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Dataset mode: `train` selects every fold except `split`; any
                other value selects only fold `split` (dev).
            split (:obj:`int`, `optional`, defaults to 1):
                The fold used as dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the metadata csv (skipping its first line) into `meta_info` rows."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            rows = rf.readlines()[1:]
        return [self.meta_info(*row.strip().split(',')) for row in rows]

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Download the data if missing, then list files and labels for `mode`."""
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        if not (os.path.isdir(audio_dir) and
                os.path.isfile(os.path.join(DATA_HOME, self.meta))):
            download_and_decompress(self.archieves, DATA_HOME)

        want_train = mode == 'train'
        files = []
        labels = []
        for sample in self._get_meta_info():
            in_dev_fold = int(sample.fold) == split
            # train keeps everything outside the dev fold; dev keeps only it.
            if want_train != in_dev_fold:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, sample.filename))
                labels.append(int(sample.target))

        return files, labels
|
@ -1,115 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import os
|
||||
import random
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
from ..utils import DATA_HOME
|
||||
from ..utils.download import download_and_decompress
|
||||
from .dataset import AudioClassificationDataset
|
||||
|
||||
__all__ = ['GTZAN']
|
||||
|
||||
|
||||
class GTZAN(AudioClassificationDataset):
    """
    GTZAN: 1000 audio tracks, each 30 seconds long, covering 10 genres with
    100 tracks per genre. It is the most-used public dataset for evaluation in
    machine listening research for music genre recognition (MGR).

    Reference:
        Musical genre classification of audio signals
        https://ieeexplore.ieee.org/document/1021072/
    """

    # Archive(s) fetched into DATA_HOME when the data is missing.
    archieves = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = [
        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock'
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Dataset mode: `train` selects every fold except `split`; any
                other value selects only fold `split` (dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Random seed used to shuffle samples before folding.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                The fold used as dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the tab-separated metadata file into `meta_info` rows."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            rows = rf.readlines()
        return [self.meta_info(*row.strip().split('\t')) for row in rows]

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Download the data if missing, then list files and labels for `mode`."""
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        if not (os.path.isdir(audio_dir) and
                os.path.isfile(os.path.join(DATA_HOME, self.meta))):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()
        # Seed the global RNG so train and dev see the same shuffled order.
        random.seed(seed)
        random.shuffle(meta_info)

        want_train = mode == 'train'
        n_samples_per_fold = len(meta_info) // n_folds
        files = []
        labels = []
        for position, sample in enumerate(meta_info):
            label = sample.label
            filename = os.path.basename(sample.file_path)
            target = self.label_list.index(label)
            # Folds are 1-based. When the sample count is not a multiple of
            # n_folds, the remainder gets fold number n_folds + 1.
            fold = position // n_samples_per_fold + 1

            # train keeps everything outside the dev fold; dev keeps only it.
            if want_train != (fold == split):
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

        return files, labels
|
@ -1,74 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
from .dataset import AudioClassificationDataset
|
||||
|
||||
__all__ = ['HeySnips']
|
||||
|
||||
|
||||
class HeySnips(AudioClassificationDataset):
    """Audio classification dataset read from `<data_dir>/<mode>.json`.

    Each json item provides `id`, `is_hotword`, `duration` and
    `audio_file_path`; items whose duration is not positive are dropped.
    """

    meta_info = collections.namedtuple('META_INFO',
                                       ('key', 'label', 'duration', 'wav'))

    def __init__(self,
                 data_dir: os.PathLike,
                 mode: str='train',
                 feat_type: str='kaldi_fbank',
                 sample_rate: int=16000,
                 **kwargs):
        """
        Args:
            data_dir (:obj:`os.PathLike`): Directory holding the json
                manifests; audio paths are resolved relative to it.
            mode (:obj:`str`, `optional`, defaults to `train`):
                Name of the manifest to read (`<mode>.json`).
            feat_type (:obj:`str`, `optional`, defaults to `kaldi_fbank`):
                The feature type to extract from each audio file.
            sample_rate (:obj:`int`, `optional`, defaults to 16000):
                Sample rate passed on to the base class.
        """
        self.data_dir = data_dir
        files, labels = self._get_data(mode)
        super(HeySnips, self).__init__(
            files=files,
            labels=labels,
            feat_type=feat_type,
            sample_rate=sample_rate,
            **kwargs)

    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
        """Read `<mode>.json` and return one `meta_info` per positive-duration item."""
        manifest = os.path.join(self.data_dir, '{}.json'.format(mode))
        with open(manifest, 'r') as f:
            items = json.load(f)

        records = []
        for item in items:
            if not item['duration'] > 0:
                continue
            records.append(
                self.meta_info(
                    key=item['id'],
                    # Hotword items get label 0, everything else -1.
                    label=0 if item['is_hotword'] == 1 else -1,
                    duration=item['duration'],
                    wav=os.path.join(self.data_dir, item['audio_file_path'])))
        return records

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """Return files and labels; also stores `self.keys` and `self.durations`."""
        meta_info = self._get_meta_info(mode)

        files = [sample.wav for sample in meta_info]
        labels = [int(sample.label) for sample in meta_info]
        self.keys = [sample.key for sample in meta_info]
        self.durations = [float(sample.duration) for sample in meta_info]

        return files, labels
|
@ -1,200 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import csv
|
||||
import os
|
||||
import random
|
||||
from typing import List
|
||||
|
||||
from paddle.io import Dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from ..utils import DATA_HOME
|
||||
from ..utils.download import download_and_decompress
|
||||
from .dataset import feat_funcs
|
||||
|
||||
__all__ = ['OpenRIRNoise']
|
||||
|
||||
|
||||
class OpenRIRNoise(Dataset):
    """Room impulse response (RIR) and point-source noise clips (OpenSLR-28).

    On construction the archive listed in ``archieves`` is downloaded and
    decompressed into ``base_path`` (skipped when that directory already
    exists), one CSV manifest per subset is generated under ``csv_path``
    (skipped when that directory already exists), and the manifest of the
    requested ``subset`` is loaded and shuffled.

    Args:
        subset (str, optional): Manifest to load, one of ``subsets``
            ('rir' or 'noise'). Defaults to 'rir'.
        feat_type (str, optional): Key into ``feat_funcs`` selecting the
            feature function applied in ``__getitem__``. Defaults to 'raw'.
        target_dir (str, optional): If given, manifests are stored under
            ``<target_dir>/open_rir_noise/csv``. Defaults to None.
        random_chunk (bool, optional): Stored on the instance; no method of
            this class reads it. Defaults to True.
        chunk_duration (float, optional): Length in seconds of the pieces
            long recordings are cut into while building the manifests.
            Defaults to 3.0.
        seed (int, optional): Currently unused. Defaults to 0.
        **kwargs: Extra keyword arguments forwarded to the feature function.

    NOTE(review): ``paddlespeech.audio.load`` / ``paddlespeech.audio.save``
    are called below, but ``paddlespeech`` is not imported in the visible
    import section of this module -- confirm the name is in scope.
    """

    archieves = [
        {
            'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
            'md5': 'e6f48e257286e05de56413b4779d8ffb',
        },
    ]

    sample_rate = 16000
    meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
    base_path = os.path.join(DATA_HOME, 'open_rir_noise')
    wav_path = os.path.join(base_path, 'RIRS_NOISES')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['rir', 'noise']

    def __init__(self,
                 subset: str='rir',
                 feat_type: str='raw',
                 target_dir=None,
                 random_chunk: bool=True,
                 chunk_duration: float=3.0,
                 seed: int=0,
                 **kwargs):
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration

        # The manifest location is stored on the class (not the instance), so
        # a custom `target_dir` also affects instances created afterwards.
        OpenRIRNoise.csv_path = os.path.join(
            target_dir, "open_rir_noise",
            "csv") if target_dir else self.csv_path
        self._data = self._get_data()
        super(OpenRIRNoise, self).__init__()

        # NOTE: `seed` is accepted but not applied; the shuffle in
        # `_get_data` uses the global `random` state as-is.

    def _get_data(self):
        """Download/prepare the dataset if needed and load the subset manifest.

        Returns:
            list: Shuffled list of ``meta_info`` tuples (id, duration, wav).
        """
        # Download audio files.
        print(f"rirs noises base path: {self.base_path}")
        if not os.path.isdir(self.base_path):
            download_and_decompress(
                self.archieves, self.base_path, decompress=True)
        else:
            print(
                f"{self.base_path} already exists, we will not download and decompress again"
            )

        # Data preparation.
        print(f"prepare the csv to {self.csv_path}")
        if not os.path.isdir(self.csv_path):
            os.makedirs(self.csv_path)
            self.prepare_data()

        data = []
        manifest = os.path.join(self.csv_path, f'{self.subset}.csv')
        # Read with the csv module (same dialect as `generate_csv`) so that
        # quoted fields, e.g. a path containing a comma, round-trip correctly.
        with open(manifest, 'r', newline='') as rf:
            reader = csv.reader(rf, delimiter=",", quotechar='"')
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # tolerate blank lines
                    continue
                audio_id, duration, wav = row
                data.append(self.meta_info(audio_id, float(duration), wav))

        random.shuffle(data)
        return data

    def _convert_to_record(self, idx: int):
        """Build the sample dict for ``idx``: manifest fields plus ``feat``."""
        sample = self._data[idx]

        # Copy every namedtuple field (id, duration, wav) into a plain dict.
        record = {field: getattr(sample, field) for field in type(sample)._fields}

        waveform, sr = paddlespeech.audio.load(record['wav'])

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # A falsy entry in `feat_funcs` means "return the waveform unchanged".
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """Return chunk ids of the form ``<audio_id>_<start>_<end>``.

        ``start``/``end`` are expressed in the same unit as ``seg_dur`` and
        ``audio_duration`` (callers in this class pass seconds). A trailing
        remainder shorter than ``seg_dur`` is dropped.
        """
        num_chunks = int(audio_duration / seg_dur)

        chunk_lst = [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]
        return chunk_lst

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """Describe ``wav_file`` as one or more manifest rows.

        When ``split_chunks`` is true and the recording is longer than
        ``self.chunk_duration``, every full chunk is written to its own wav
        file under ``self.base_path`` and gets its own row; otherwise a
        single row describing the whole file is returned.

        Returns:
            List of ``[id, duration, wav_path]`` rows.
        """
        waveform, sr = paddlespeech.audio.load(wav_file)
        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
        audio_duration = waveform.shape[0] / sr

        ret = []
        if split_chunks and audio_duration > self.chunk_duration:
            # Split into pieces of self.chunk_duration seconds.
            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
                                                audio_duration)

            for idx, chunk in enumerate(uniq_chunks_list):
                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
                start_sample = int(float(s) * sr)
                end_sample = int(float(e) * sr)
                new_wav_file = os.path.join(self.base_path,
                                            audio_id + f'_chunk_{idx+1:02}.wav')
                paddlespeech.audio.save(waveform[start_sample:end_sample], sr,
                                        new_wav_file)
                # id, duration, new_wav
                ret.append([chunk, self.chunk_duration, new_wav_file])
        else:  # Keep whole audio.
            ret.append([audio_id, audio_duration, wav_file])
        return ret

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool=True):
        """Write the manifest rows of all ``wav_files`` to ``output_file``."""
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav"]

        csv_lines = []
        for wav_file in tqdm(wav_files, total=len(wav_files)):
            csv_lines.extend(self._get_audio_info(wav_file, split_chunks))

        # newline='' lets the csv module control line endings (avoids
        # '\r\r\n' on platforms that translate '\n').
        with open(output_file, mode="w", newline='') as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            csv_writer.writerows(csv_lines)

    def _read_file_list(self, list_file: str) -> List[str]:
        """Return the wav paths named by the last column of ``list_file``."""
        wav_files = []
        with open(list_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:  # skip blank lines instead of emitting a bogus path
                    continue
                # Entries are relative to `base_path`.
                wav_files.append(
                    os.path.join(self.base_path, line.split(' ')[-1]))
        return wav_files

    def prepare_data(self):
        """Generate ``rir.csv`` and ``noise.csv`` from the dataset's list files."""
        rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
                                "rir_list")
        noise_list = os.path.join(self.wav_path, "pointsource_noises",
                                  "noise_list")
        rir_files = self._read_file_list(rir_list)
        noise_files = self._read_file_list(noise_list)

        self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
        self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))

    def __getitem__(self, idx):
        return self._convert_to_record(idx)

    def __len__(self):
        return len(self._data)
|
@ -1,126 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import os
|
||||
import random
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
from ..utils import DATA_HOME
|
||||
from ..utils.download import download_and_decompress
|
||||
from .dataset import AudioClassificationDataset
|
||||
|
||||
__all__ = ['TESS']
|
||||
|
||||
|
||||
class TESS(AudioClassificationDataset):
    """
    TESS is a set of 200 target words that were spoken in the carrier phrase
    "Say the word _____" by two actresses (aged 26 and 64 years), and
    recordings were made of the set portraying each of seven emotions (anger,
    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
    There are 2800 stimuli in total.

    Reference:
        Toronto emotional speech set (TESS)
        https://doi.org/10.5683/SP2/E8H2MF
    """

    archieves = [
        {
            'url':
            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
            'md5':
            '1465311b24d1de704c4c63e4ccc470c7',
        },
    ]
    label_list = [
        'angry',
        'disgust',
        'fear',
        'happy',
        'neutral',
        'ps',  # pleasant surprise
        'sad',
    ]
    meta_info = collections.namedtuple('META_INFO',
                                       ('speaker', 'word', 'emotion'))
    audio_path = 'TESS_Toronto_emotional_speech_set'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold used as the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(TESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Parse ``<speaker>_<word>_<emotion>.wav`` file names into ``meta_info``."""
        ret = []
        for file in files:
            stem = os.path.splitext(os.path.basename(file))[0]
            ret.append(self.meta_info(*stem.split('_')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect the wav files and labels belonging to ``mode``.

        The archive is downloaded when the audio directory is missing. Files
        are shuffled with ``seed`` and cut into ``n_folds`` folds; fold
        ``split`` is the dev set, all other folds form the train set.

        Returns:
            Tuple of (file paths, label indices into ``label_list``).
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        if not os.path.isdir(audio_dir):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(audio_dir):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        # Seed the global RNG so that train and dev instances created with
        # the same seed see the same shuffled order.
        random.seed(seed)
        random.shuffle(wav_files)
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        # At least one sample per fold: avoids a ZeroDivisionError when there
        # are fewer files than folds.
        n_samples_per_fold = max(1, len(meta_info) // n_folds)
        want_train = mode == 'train'
        for idx, sample in enumerate(meta_info):
            target = self.label_list.index(sample.emotion)
            # When the sample count is not a multiple of n_folds, the leftover
            # samples would land in a non-existent fold n_folds + 1 and never
            # reach any dev split; assign them to the last fold instead.
            fold = min(idx // n_samples_per_fold + 1, n_folds)

            # Train mode takes every fold except `split`; any other mode takes
            # only fold `split`.
            if want_train != (fold == split):
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
@ -1,104 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import os
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
from ..utils import DATA_HOME
|
||||
from ..utils.download import download_and_decompress
|
||||
from .dataset import AudioClassificationDataset
|
||||
|
||||
__all__ = ['UrbanSound8K']
|
||||
|
||||
|
||||
class UrbanSound8K(AudioClassificationDataset):
    """
    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
    drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
    classes are drawn from the urban sound taxonomy.

    Reference:
        A Dataset and Taxonomy for Urban Sound Research
        https://dl.acm.org/doi/10.1145/2647868.2655045
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
            'md5': '9aa69802bbf37fb986f71ec1483a196e',
        },
    ]
    label_list = [
        "air_conditioner", "car_horn", "children_playing", "dog_bark",
        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
        "street_music"
    ]
    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
    meta_info = collections.namedtuple(
        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
                      'class_id', 'label'))
    audio_path = os.path.join('UrbanSound8K', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold used as the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self):
        """Read the metadata CSV (header skipped) into ``meta_info`` tuples."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            next(rf, None)  # skip the header row
            for line in rf:
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Collect the wav files and class ids belonging to ``mode``.

        The archive is downloaded when either the audio directory or the
        metadata file is missing. Fold ``split`` is the dev set; every other
        fold belongs to the train set.

        Returns:
            Tuple of (file paths, integer class ids).
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        files = []
        labels = []
        want_train = mode == 'train'
        for sample in self._get_meta_info():
            fold = sample.fold
            # Train mode takes every fold except `split`; any other mode takes
            # only fold `split`.
            if want_train != (int(fold) == split):
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 sample.filename))
                labels.append(int(sample.class_id))

        return files, labels
|
@ -1,355 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import collections
|
||||
import csv
|
||||
import glob
|
||||
import os
|
||||
import random
|
||||
from multiprocessing import cpu_count
|
||||
from typing import List
|
||||
|
||||
from paddle.io import Dataset
|
||||
from pathos.multiprocessing import Pool
|
||||
from tqdm import tqdm
|
||||
|
||||
from ..utils import DATA_HOME
|
||||
from ..utils import decompress
|
||||
from ..utils.download import download_and_decompress
|
||||
from .dataset import feat_funcs
|
||||
|
||||
__all__ = ['VoxCeleb']
|
||||
|
||||
|
||||
class VoxCeleb(Dataset):
    """VoxCeleb1 (optionally plus VoxCeleb2) speaker dataset.

    On construction the VoxCeleb1 audio and the verification trial list are
    downloaded when missing, CSV manifests for the 'train', 'dev', 'enroll'
    and 'test' subsets are generated when the csv directory is missing, and
    the manifest of the requested subset is loaded.

    NOTE(review): ``paddlespeech.audio.load`` is called below, but
    ``paddlespeech`` is not imported in the visible import section of this
    module -- confirm the name is in scope.
    """

    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
    archieves_audio_dev = [
        {
            'url': source_url + 'vox1_dev_wav_partaa',
            'md5': 'e395d020928bc15670b570a21695ed96',
        },
        {
            'url': source_url + 'vox1_dev_wav_partab',
            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
        },
        {
            'url': source_url + 'vox1_dev_wav_partac',
            'md5': '017d579a2a96a077f40042ec33e51512',
        },
        {
            'url': source_url + 'vox1_dev_wav_partad',
            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
        },
    ]
    archieves_audio_test = [
        {
            'url': source_url + 'vox1_test_wav.zip',
            'md5': '185fdc63c3c739954633d50379a3d102',
        },
    ]
    archieves_meta = [
        {
            'url':
            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
            'md5':
            'b73110731c9223c1461fe49cb48dddfc',
        },
    ]

    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
    sample_rate = 16000
    meta_info = collections.namedtuple(
        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
    base_path = os.path.join(DATA_HOME, 'vox1')
    wav_path = os.path.join(base_path, 'wav')
    meta_path = os.path.join(base_path, 'meta')
    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['train', 'dev', 'enroll', 'test']

    def __init__(
            self,
            subset: str='train',
            feat_type: str='raw',
            random_chunk: bool=True,
            chunk_duration: float=3.0,  # seconds
            split_ratio: float=0.9,  # train split ratio
            seed: int=0,
            target_dir: str=None,
            vox2_base_path=None,
            **kwargs):
        """VoxCeleb data prepare and get the specific dataset audio info

        Args:
            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
            feat_type (str, optional): feat type, such as raw, melspectrogram(fbank) or mfcc. Defaults to 'raw'.
            random_chunk (bool, optional): randomly select a chunk from the audio. Defaults to True.
            chunk_duration (float, optional): chunk duration in seconds, used if the random_chunk flag is set
                and when splitting audio into manifest rows. Defaults to 3.0.
            split_ratio (float, optional): fraction of the non-test audio files assigned to the train
                subset; the rest go to dev. Defaults to 0.9.
            seed (int, optional): currently unused. Defaults to 0.
            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
            vox2_base_path (str, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
            **kwargs: extra keyword arguments forwarded to the feature function.
        """
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.spk_id2label = {}
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration
        self.split_ratio = split_ratio
        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
        self.vox2_base_path = vox2_base_path

        # If a target dir is given, the manifest and meta locations move from
        # the base path to the target dir. They are stored on the class, so
        # the change also affects instances created afterwards.
        VoxCeleb.csv_path = os.path.join(
            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
        VoxCeleb.meta_path = os.path.join(
            target_dir, "voxceleb",
            'meta') if target_dir else VoxCeleb.meta_path
        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
                                               'veri_test2.txt')
        self._data = self._get_data()
        super(VoxCeleb, self).__init__()

        # NOTE: `seed` is accepted but not applied; random chunking and the
        # train/dev shuffle use the global `random` state as-is.

    @staticmethod
    def _concatenate_parts(parts_dir: str, parts_glob: str,
                           output_file: str) -> None:
        """Concatenate the files matching ``parts_glob`` in ``parts_dir``.

        Parts are appended to ``output_file`` in sorted name order. Done in
        Python rather than through a shell so that paths with spaces work
        and no POSIX ``cat`` is required.
        """
        import shutil

        # Escape the directory so that glob metacharacters in it are literal.
        pattern = os.path.join(glob.escape(parts_dir), parts_glob)
        with open(output_file, 'wb') as dst:
            for part in sorted(glob.glob(pattern)):
                with open(part, 'rb') as src:
                    shutil.copyfileobj(src, dst)

    def _get_data(self):
        """Download/prepare the dataset if needed and load the subset manifest.

        Also fills ``self.spk_id2label`` from ``spk_id2label.txt``.

        Returns:
            list: ``meta_info`` tuples (id, duration, wav, start, stop, spk_id).
        """
        # Download audio files.
        # Users are expected to have all of vox1/dev/wav and vox1/test/wav
        # decompressed into vox1/wav, so only that directory is checked.
        print(f"wav base path: {self.wav_path}")
        if not os.path.isdir(self.wav_path):
            print("start to download the voxceleb1 dataset")
            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
                self.archieves_audio_dev,
                self.base_path,
                decompress=False)
            download_and_decompress(  # download the vox1_test_wav.zip and unzip
                self.archieves_audio_test,
                self.base_path,
                decompress=True)

            # Concatenate all downloaded parts into one zip file.
            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
            print(f'Concatenating all parts to: {dev_zipfile}')
            self._concatenate_parts(self.base_path, "vox1_dev_wav_parta*",
                                    dev_zipfile)

            # Extract all audio files of the dev set.
            decompress(dev_zipfile, self.base_path)

        # Download meta files.
        if not os.path.isdir(self.meta_path):
            print("prepare the meta data")
            download_and_decompress(
                self.archieves_meta, self.meta_path, decompress=False)

        # Data preparation.
        if not os.path.isdir(self.csv_path):
            os.makedirs(self.csv_path)
            self.prepare_data()

        data = []
        manifest = os.path.join(self.csv_path, f'{self.subset}.csv')
        print(f"read the {self.subset} from {manifest}")
        # Read with the csv module (same dialect as `generate_csv`) so that
        # quoted fields round-trip correctly.
        with open(manifest, 'r', newline='') as rf:
            reader = csv.reader(rf, delimiter=",", quotechar='"')
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # tolerate blank lines
                    continue
                audio_id, duration, wav, start, stop, spk_id = row
                data.append(
                    self.meta_info(audio_id,
                                   float(duration), wav,
                                   int(start), int(stop), spk_id))

        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
            for line in f:
                spk_id, label = line.strip().split(' ')
                self.spk_id2label[spk_id] = int(label)

        return data

    def _convert_to_record(self, idx: int):
        """Build the sample dict for ``idx``.

        The dict holds the manifest fields, ``feat`` and, for the 'train' and
        'dev' subsets, the integer speaker ``label``.
        """
        sample = self._data[idx]

        # Copy every namedtuple field into a plain dict.
        record = {field: getattr(sample, field) for field in type(sample)._fields}

        waveform, sr = paddlespeech.audio.load(record['wav'])

        if self.random_chunk:
            # Randomly select a chunk of samples from the audio.
            num_wav_samples = waveform.shape[0]
            num_chunk_samples = int(self.chunk_duration * sr)
            # Clamp at 0: a clip no longer than one chunk would otherwise
            # give randint an empty range and raise ValueError.
            last_start = max(0, num_wav_samples - num_chunk_samples - 1)
            start = random.randint(0, last_start)
            stop = start + num_chunk_samples
        else:
            start = record['start']
            stop = record['stop']

        waveform = waveform[start:stop]

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # A falsy entry in `feat_funcs` means "return the waveform unchanged".
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        if self.subset in ['train',
                           'dev']:  # Labels are available in train and dev.
            record.update({'label': self.spk_id2label[record['spk_id']]})

        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """Return chunk ids of the form ``<audio_id>_<start>_<end>``.

        ``start``/``end`` are expressed in the same unit as ``seg_dur`` and
        ``audio_duration`` (callers in this class pass seconds). A trailing
        remainder shorter than ``seg_dur`` is dropped.
        """
        num_chunks = int(audio_duration / seg_dur)

        chunk_lst = [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]
        return chunk_lst

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """Describe ``wav_file`` as one or more manifest rows.

        The last three path components are taken as speaker id, session id
        and utterance file name. With ``split_chunks`` each full chunk of
        ``self.chunk_duration`` seconds gets its own row (no audio is
        written; rows only carry sample offsets); otherwise one row covers
        the whole file.

        Returns:
            List of ``[id, duration, wav, start, stop, spk_id]`` rows.
        """
        waveform, sr = paddlespeech.audio.load(wav_file)
        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
        audio_duration = waveform.shape[0] / sr

        ret = []
        if split_chunks:  # Split into pieces of self.chunk_duration seconds.
            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
                                                audio_duration)

            for chunk in uniq_chunks_list:
                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
                start_sample = int(float(s) * sr)
                end_sample = int(float(e) * sr)
                # id, duration, wav, start, stop, spk_id
                ret.append([
                    chunk, audio_duration, wav_file, start_sample, end_sample,
                    spk_id
                ])
        else:  # Keep whole audio.
            ret.append([
                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
            ])
        return ret

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool=True):
        """Write the manifest rows of all ``wav_files`` to ``output_file``."""
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
        # Note: this may raise a C++ exception, but the program still runs
        # fine, so the exception can be ignored.
        with Pool(cpu_count()) as p:
            infos = list(
                tqdm(
                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
                           wav_files),
                    total=len(wav_files)))

        csv_lines = []
        for info in infos:
            csv_lines.extend(info)

        # newline='' lets the csv module control line endings (avoids
        # '\r\r\n' on platforms that translate '\n').
        with open(output_file, mode="w", newline='') as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            csv_writer.writerows(csv_lines)

    def prepare_data(self):
        """Generate the speaker label map and the four subset manifests.

        Audio of speakers that appear in ``veri_test_file`` is excluded from
        the train and dev subsets.
        """
        print("start to prepare the data csv file")
        enroll_files = set()
        test_files = set()
        # get the enroll and test audio file path
        with open(self.veri_test_file, 'r') as f:
            for line in f:
                _, enrol_file, test_file = line.strip().split(' ')
                enroll_files.add(os.path.join(self.wav_path, enrol_file))
                test_files.add(os.path.join(self.wav_path, test_file))
        enroll_files = sorted(enroll_files)
        test_files = sorted(test_files)

        # get the enroll and test speakers
        test_spks = set()
        for file in (enroll_files + test_files):
            spk = file.split('/wav/')[1].split('/')[0]
            test_spks.add(spk)

        # get all the train and dev audios file path
        audio_files = []
        speakers = set()
        print("Getting file list...")
        for path in [self.wav_path, self.vox2_base_path]:
            # A path that is unset or does not exist is skipped.
            if not path or not os.path.exists(path):
                print(f"{path} is an invalid path, please check again, "
                      "and we will ignore the vox2 base path")
                continue
            for file in glob.glob(
                    os.path.join(path, "**", "*.wav"), recursive=True):
                spk = file.split('/wav/')[1].split('/')[0]
                if spk in test_spks:
                    continue
                speakers.add(spk)
                audio_files.append(file)

        print(
            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
        )
        # encode the train and dev speakers label to spk_id2label.txt
        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
            for label, spk_id in enumerate(
                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
                f.write(f'{spk_id} {label}\n')

        audio_files = sorted(audio_files)
        random.shuffle(audio_files)
        split_idx = int(self.split_ratio * len(audio_files))
        # The first `split_ratio` share goes to train, the remainder to dev.
        train_files, dev_files = audio_files[:split_idx], audio_files[
            split_idx:]

        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))

        self.generate_csv(
            enroll_files,
            os.path.join(self.csv_path, 'enroll.csv'),
            split_chunks=False)
        self.generate_csv(
            test_files,
            os.path.join(self.csv_path, 'test.csv'),
            split_chunks=False)

    def __getitem__(self, idx):
        return self._convert_to_record(idx)

    def __len__(self):
        return len(self._data)
|
@ -1,17 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .layers import LogMelSpectrogram
|
||||
from .layers import MelSpectrogram
|
||||
from .layers import MFCC
|
||||
from .layers import Spectrogram
|
@ -1,328 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import Tensor
|
||||
|
||||
from ..functional import compute_fbank_matrix
|
||||
from ..functional import create_dct
|
||||
from ..functional import power_to_db
|
||||
from ..functional.window import get_window
|
||||
|
||||
__all__ = [
|
||||
'Spectrogram',
|
||||
'MelSpectrogram',
|
||||
'LogMelSpectrogram',
|
||||
'MFCC',
|
||||
]
|
||||
|
||||
|
||||
class Spectrogram(nn.Layer):
    r"""Spectrogram layer for batches of signals, typically audio waveforms.

    The output is the magnitude of the short-time Fourier transform raised to
    ``power``.

    Args:
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that :math:`t \times hop\_length` is at the center of the `t`-th frame. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str='float32') -> None:
        super(Spectrogram, self).__init__()

        assert power > 0, 'Power of spectrogram must be > 0.'
        self.power = power

        # The analysis window spans the whole FFT frame unless told otherwise.
        win_length = n_fft if win_length is None else win_length

        self.fft_window = get_window(
            window, win_length, fftbins=True, dtype=dtype)

        # Freeze every STFT option now; `forward` only supplies the signal.
        stft_options = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.fft_window,
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_options)
        self.register_buffer('fft_window', self.fft_window)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
        """
        magnitude = paddle.abs(self._stft(x))
        return paddle.pow(magnitude, self.power)
|
||||
|
||||
|
||||
class MelSpectrogram(nn.Layer):
    """Compute the melspectrogram of given signals, typically audio waveforms.

    The result is the spectrogram multiplied by a Mel filter bank matrix.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. If `None`, the Nyquist frequency `sr / 2` is used. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        dtype (str, optional): Data type of input, window and filter bank. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str='float32') -> None:
        super(MelSpectrogram, self).__init__()

        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)
        self.n_mels = n_mels
        self.f_min = f_min
        # Stored exactly as passed by the caller (may be None).
        self.f_max = f_max
        self.htk = htk
        self.norm = norm
        if f_max is None:
            # Nyquist frequency. True division keeps the half Hz for odd
            # sample rates and matches compute_fbank_matrix's own default.
            f_max = float(sr) / 2
        # The filter bank is built in the same dtype as the spectrogram.
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the mel spectrogram of a batch of waveforms.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        spect_feature = self._spectrogram(x)
        # Project the linear-frequency bins onto the mel filter bank.
        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
        return mel_feature
|
||||
|
||||
|
||||
class LogMelSpectrogram(nn.Layer):
    """Compute log-mel-spectrogram feature of given signals, typically audio waveforms.

    A mel spectrogram is computed first and then converted to decibels.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        super().__init__()

        # All spectral options are delegated to the mel front end.
        mel_options = dict(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self._melspectrogram = MelSpectrogram(**mel_options)

        # Parameters of the decibel conversion applied in `forward`.
        self.ref_value = ref_value
        self.amin = amin
        self.top_db = top_db

    def forward(self, x: Tensor) -> Tensor:
        """Compute the log-mel spectrogram of a batch of waveforms.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        return power_to_db(
            self._melspectrogram(x),
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
|
||||
|
||||
|
||||
class MFCC(nn.Layer):
    """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.

    A log-mel spectrogram is computed first and then projected onto a DCT basis.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_mfcc (int, optional): Number of cepstral coefficients to return; must not exceed `n_mels`. Defaults to 40.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.

    Raises:
        AssertionError: If `n_mfcc` is larger than `n_mels`.
    """

    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        super(MFCC, self).__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)
        self._log_melspectrogram = LogMelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            ref_value=ref_value,
            amin=amin,
            top_db=top_db,
            dtype=dtype)
        # DCT basis of shape (n_mels, n_mfcc), per create_dct's contract.
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """Compute the MFCCs of a batch of waveforms.

        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
        """
        log_mel_feature = self._log_melspectrogram(x)
        # Move the mel axis last so it can be contracted with the DCT basis,
        # then restore the (N, coefficients, frames) layout.
        mfcc = paddle.matmul(
            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
                (0, 2, 1))  # (N, n_mfcc, num_frames)
        return mfcc
|
@ -1,20 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .functional import compute_fbank_matrix
|
||||
from .functional import create_dct
|
||||
from .functional import fft_frequencies
|
||||
from .functional import hz_to_mel
|
||||
from .functional import mel_frequencies
|
||||
from .functional import mel_to_hz
|
||||
from .functional import power_to_db
|
@ -1,266 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Modified from librosa(https://github.com/librosa/librosa)
|
||||
import math
|
||||
from typing import Optional
|
||||
from typing import Union
|
||||
|
||||
import paddle
|
||||
from paddle import Tensor
|
||||
|
||||
__all__ = [
|
||||
'hz_to_mel',
|
||||
'mel_to_hz',
|
||||
'mel_frequencies',
|
||||
'fft_frequencies',
|
||||
'compute_fbank_matrix',
|
||||
'power_to_db',
|
||||
'create_dct',
|
||||
]
|
||||
|
||||
|
||||
def hz_to_mel(freq: Union[Tensor, float],
              htk: bool=False) -> Union[Tensor, float]:
    """Convert frequencies in Hz to the mel scale.

    With `htk=False` the scale is linear up to 1000 Hz and logarithmic above.

    Args:
        freq (Union[Tensor, float]): The input tensor with arbitrary shape, or a scalar.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[Tensor, float]: Frequency in mels.
    """
    is_tensor = isinstance(freq, Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    linear_part = (freq - f_min) / f_sp

    if not is_tensor:
        if freq >= min_log_hz:
            return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
        return linear_part

    # The 1e-10 offset keeps log() finite for zero frequencies, which would
    # otherwise turn into nan when multiplied by the zero mask below.
    log_part = min_log_mel + paddle.log(
        freq / min_log_hz + 1e-10) / logstep
    in_log_region = (freq > min_log_hz).astype(freq.dtype)
    # Arithmetic blend of the two branches selected by the 0/1 mask.
    return log_part * in_log_region + linear_part * (1 - in_log_region)
|
||||
|
||||
|
||||
def mel_to_hz(mel: Union[float, Tensor],
              htk: bool=False) -> Union[float, Tensor]:
    """Convert mel values to frequencies in Hz.

    Args:
        mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape, or a scalar.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[float, Tensor]: Frequencies in Hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    linear_part = f_min + f_sp * mel

    if not isinstance(mel, Tensor):
        if mel >= min_log_mel:
            return min_log_hz * math.exp(logstep * (mel - min_log_mel))
        return linear_part

    log_part = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
    in_log_region = (mel > min_log_mel).astype(mel.dtype)
    # Arithmetic blend of the two branches selected by the 0/1 mask.
    return log_part * in_log_region + linear_part * (1 - in_log_region)
|
||||
|
||||
|
||||
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str='float32') -> Tensor:
    """Compute frequencies that are evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.
        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.

    Returns:
        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Convert the limits to mels, space points uniformly there, convert back.
    mel_lo = hz_to_mel(f_min, htk=htk)
    mel_hi = hz_to_mel(f_max, htk=htk)
    mel_points = paddle.linspace(mel_lo, mel_hi, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
|
||||
|
||||
|
||||
def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
    """Compute the center frequency of every non-negative FFT bin.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.

    Returns:
        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, nyquist, n_bins, dtype=dtype)
|
||||
|
||||
|
||||
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str='float32') -> Tensor:
    """Build the triangular mel filter bank matrix.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (Optional[float], optional): Maximum frequency in Hz; `sr / 2` when `None`. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'.
        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.

    Returns:
        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
    """
    if f_max is None:
        f_max = float(sr) / 2

    n_bins = int(1 + n_fft // 2)
    weights = paddle.zeros((n_mels, n_bins), dtype=dtype)

    # Center frequency of each FFT bin.
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # Band edges: n_mels triangles need n_mels + 2 points on the mel scale.
    mel_f = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    # Spacing between consecutive edges and the edge-to-bin distance table.
    fdiff = mel_f[1:] - mel_f[:-1]
    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)

    for band in range(n_mels):
        # Rising and falling slopes of this band's triangle over all bins.
        rising = -ramps[band] / fdiff[band]
        falling = ramps[band + 2] / fdiff[band + 1]

        # The triangle is the lower of the two slopes, clipped at zero.
        weights[band] = paddle.maximum(
            paddle.zeros_like(rising), paddle.minimum(rising, falling))

    if norm == 'slaney':
        # Slaney-style mel is scaled to be approx constant energy per channel.
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
|
||||
|
||||
|
||||
def power_to_db(spect: Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling `10 * log10(x / ref)` in a numerically
    stable way.

    Args:
        spect (Tensor): STFT power spectrogram.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.

    Returns:
        Tensor: Power spectrogram in db scale.

    Raises:
        ValueError: If `amin` or `ref_value` is not strictly positive, or if
            `top_db` is negative.
    """
    # Validate every argument before any tensor work. ValueError is a
    # subclass of Exception, so callers that catch Exception still work.
    if amin <= 0:
        raise ValueError("amin must be strictly positive")

    if ref_value <= 0:
        raise ValueError("ref_value must be strictly positive")

    if top_db is not None and top_db < 0:
        raise ValueError("top_db must be non-negative")

    ones = paddle.ones_like(spect)
    # Clamp to amin first so log10 never sees zero.
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is not None:
        # Floor everything that lies more than top_db below the global peak.
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))

    return log_spec
|
||||
|
||||
|
||||
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: str='float32') -> Tensor:
    """Create a discrete cosine transform(DCT) matrix.

    Args:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (Optional[str], optional): Normalization type, either 'ortho' or None. Defaults to 'ortho'.
        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.

    Returns:
        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
    """
    mel_index = paddle.arange(n_mels, dtype=dtype)
    coef_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    # Broadcasting the column of coefficient indices against the row of mel
    # indices yields a basis of size (n_mfcc, n_mels).
    basis = paddle.cos(math.pi / float(n_mels) * (mel_index + 0.5) *
                       coef_index)

    if norm is None:
        basis *= 2.0
        return basis.T

    assert norm == "ortho"
    # Orthonormal scaling: the first row gets an extra 1/sqrt(2).
    basis[0] *= 1.0 / math.sqrt(2.0)
    basis *= math.sqrt(2.0 / float(n_mels))
    return basis.T
|
@ -1,337 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
# limitations under the License.
|
||||
import math
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
from typing import Union
|
||||
|
||||
import paddle
|
||||
from paddle import Tensor
|
||||
|
||||
__all__ = [
|
||||
'get_window',
|
||||
]
|
||||
|
||||
|
||||
def _cat(x: List[Union[Tensor, float]], data_type: str) -> Tensor:
    """Concatenate tensors and Python scalars into one tensor.

    Args:
        x: Items to join, in order. Each item is a tensor or a Python number.
        data_type: Data type every item is converted to.

    Returns:
        Tensor: The concatenation of all converted items.
    """
    pieces = []
    for item in x:
        if isinstance(item, (int, float)):
            # Wrap scalars in a list so each piece has an explicit length of
            # one, however the framework would shape a bare scalar.
            item = [item]
        pieces.append(paddle.to_tensor(item, data_type))
    return paddle.concat(pieces)
|
||||
|
||||
|
||||
def _acosh(x: Union[Tensor, float]) -> Union[Tensor, float]:
    """Compute the inverse hyperbolic cosine of `x`.

    Args:
        x: A Python number or a tensor.

    Returns:
        A float for a Python number, otherwise a tensor computed elementwise.

    Raises:
        ValueError: For a Python number smaller than 1 (math domain error).
    """
    if isinstance(x, (int, float)):
        # Python ints are handled here too; they are not valid tensor input.
        return math.acosh(x)
    # acosh(x) = log(x + sqrt(x^2 - 1)), written with tensor ops.
    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
|
||||
|
||||
|
||||
def _extend(M: int, sym: bool) -> bool:
|
||||
"""Extend window by 1 sample if needed for DFT-even symmetry. """
|
||||
if not sym:
|
||||
return M + 1, True
|
||||
else:
|
||||
return M, False
|
||||
|
||||
|
||||
def _len_guards(M: int) -> bool:
|
||||
"""Handle small or incorrect window lengths. """
|
||||
if int(M) != M or M < 0:
|
||||
raise ValueError('Window length M must be a non-negative integer')
|
||||
|
||||
return M <= 1
|
||||
|
||||
|
||||
def _truncate(w: Tensor, needed: bool) -> Tensor:
    """Drop the last sample of `w` when `needed` is true, else return `w`.

    This is the second half of building a periodic (DFT-even) window.
    """
    return w[:-1] if needed else w
|
||||
|
||||
|
||||
def _general_gaussian(M: int, p, sig, sym: bool=True,
                      dtype: str='float64') -> Tensor:
    """Compute a window with a generalized Gaussian shape.

    The window is `exp(-0.5 * |n / sig| ** (2 * p))` on centered sample indices.
    This function is consistent with scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    # Sample positions measured from the window center.
    centered = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
    window = paddle.exp(-0.5 * paddle.abs(centered / sig)**(2 * p))

    return _truncate(window, needs_trunc)
|
||||
|
||||
|
||||
def _general_cosine(M: int, a: List[float], sym: bool=True,
                    dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.

    This function is consistent with scipy.signal.windows.general_cosine().

    Args:
        M: Number of samples in the window.
        a: Weighting coefficients; `a[k]` scales the term `cos(k * fac)`.
        sym: True for a symmetric window, False for a periodic one.
        dtype: Data type of the returned window.

    Returns:
        Tensor: The window. For `M <= 1` a tensor of ones of length `M`.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    w = paddle.zeros((M, ), dtype=dtype)
    for k, coef in enumerate(a):
        w += coef * paddle.cos(k * fac)
    return _truncate(w, needs_trunc)
|
||||
|
||||
|
||||
def _general_hamming(M: int, alpha: float, sym: bool=True,
                     dtype: str='float64') -> Tensor:
    """Compute a generalized Hamming window.

    It is the two-term cosine sum with coefficients `alpha` and `1 - alpha`.
    This function is consistent with scipy.signal.windows.general_hamming()
    """
    coefficients = [alpha, 1. - alpha]
    return _general_cosine(M, coefficients, sym, dtype=dtype)
|
||||
|
||||
|
||||
def _taylor(M: int,
            nbar=4,
            sll=30,
            norm=True,
            sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Taylor window.

    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.

    Args:
        M: Number of samples in the window.
        nbar: Number of nearly constant-level sidelobes next to the mainlobe.
        sll: Sidelobe suppression level in dB, given as a positive number.
        norm: When true, divide the window by its value at the center.
        sym: True for a symmetric window, False for a periodic one.
        dtype: Data type of the returned window.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    # The original text uses a negative sidelobe level and negates it when
    # computing B; here the level is taken as positive, like other windows.
    B = 10**(sll / 20)
    A = _acosh(B) / math.pi
    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
    orders = paddle.arange(1, nbar, dtype=dtype)

    # Alternating +1 / -1 factor, one per order.
    signs = paddle.empty_like(orders)
    signs[::2] = 1
    signs[1::2] = -1

    coefs = paddle.empty((nbar - 1, ), dtype=dtype)
    orders_sq = orders * orders
    last = len(orders) - 1
    for idx in range(len(orders)):
        this_sq = orders_sq[idx]
        numer = signs[idx] * paddle.prod(1 - this_sq / s2 / (A**2 + (
            orders - 0.5)**2))
        # The denominator is a product over every *other* order, so the
        # first and last orders only have one side to multiply over.
        if idx == 0:
            denom = 2 * paddle.prod(1 - this_sq / orders_sq[idx + 1:])
        elif idx == last:
            denom = 2 * paddle.prod(1 - this_sq / orders_sq[:idx])
        else:
            denom = 2 * paddle.prod(1 - this_sq / orders_sq[:idx]) * paddle.prod(
                1 - this_sq / orders_sq[idx + 1:])

        coefs[idx] = numer / denom

    def taper(n):
        # Cosine series evaluated at sample position(s) n.
        return 1 + 2 * paddle.matmul(
            coefs.unsqueeze(0),
            paddle.cos(2 * math.pi * orders.unsqueeze(1) * (n - M / 2. + 0.5) /
                       M))

    w = taper(paddle.arange(0, M, dtype=dtype))

    # normalize (Note that this is not described in the original text [1])
    if norm:
        scale = 1.0 / taper((M - 1) / 2)
        w *= scale
    w = w.squeeze()
    return _truncate(w, needs_trunc)
|
||||
|
||||
|
||||
def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hamming window.

    The Hamming window is a raised-cosine taper with non-zero endpoints,
    optimized to minimize the nearest side lobe. It is the generalized
    Hamming window with `alpha = 0.54`.
    """
    alpha = 0.54
    return _general_hamming(M, alpha, sym, dtype=dtype)
|
||||
|
||||
|
||||
def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hann window.

    The Hann window is a raised-cosine (sine-squared) taper whose ends touch
    zero. It is the generalized Hamming window with `alpha = 0.5`.
    """
    alpha = 0.5
    return _general_hamming(M, alpha, sym, dtype=dtype)
|
||||
|
||||
|
||||
def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Tukey window.

    The Tukey window is also known as a tapered cosine window.

    Args:
        M: Number of samples in the window.
        alpha: Fraction of the window inside the cosine tapered region.
            `alpha <= 0` gives a rectangular window and `alpha >= 1` gives
            a Hann window.
        sym: True for a symmetric window, False for a periodic one.
        dtype: Data type of the returned window.

    Returns:
        Tensor: The window. For `M <= 1` a tensor of ones of length `M`.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    if alpha <= 0:
        return paddle.ones((M, ), dtype=dtype)
    elif alpha >= 1.0:
        # The helper is named `_hann` in this module; `dtype` is forwarded so
        # this branch returns the same data type as the others.
        return _hann(M, sym=sym, dtype=dtype)

    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype)
    width = int(alpha * (M - 1) / 2.0)
    # Split the samples into rising taper, flat top and falling taper.
    n1 = n[0:width + 1]
    n2 = n[width + 1:M - width - 1]
    n3 = n[M - width - 1:]

    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
                                          (M - 1))))
    w = paddle.concat([w1, w2, w3])

    return _truncate(w, needs_trunc)
|
||||
|
||||
|
||||
def _kaiser(M: int, beta: float, sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Kaiser window.

    The Kaiser window is a taper formed by using a Bessel function.

    Raises:
        NotImplementedError: Always; this window is not implemented here.
    """
    raise NotImplementedError
|
||||
|
||||
|
||||
def _gaussian(M: int, std: float, sym: bool=True,
              dtype: str='float64') -> Tensor:
    """Compute a Gaussian window.

    The Gaussian window has a Gaussian shape defined by the standard
    deviation (std), measured in samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    # Sample positions measured from the window center.
    centered = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
    two_variance = 2 * std * std
    window = paddle.exp(-centered**2 / two_variance)

    return _truncate(window, needs_trunc)
|
||||
|
||||
|
||||
def _exponential(M: int,
                 center=None,
                 tau=1.,
                 sym: bool=True,
                 dtype: str='float64') -> Tensor:
    """Compute an exponential (or Poisson) window.

    The window is `exp(-|n - center| / tau)`. When `center` is None it is
    placed at the middle of the (possibly extended) window.

    Raises:
        ValueError: If `sym` is true and `center` is given.
    """
    if sym and center is not None:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    # The default peak position depends on the extended length.
    peak = (M - 1) / 2 if center is None else center

    positions = paddle.arange(0, M, dtype=dtype)
    window = paddle.exp(-paddle.abs(positions - peak) / tau)

    return _truncate(window, needs_trunc)
|
||||
|
||||
|
||||
def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a triangular window.

    The rising half is computed and then mirrored to form the falling half.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
    if M % 2 == 0:
        rising = (2 * n - 1.0) / M
        # Even length: mirror the whole rising half.
        falling = rising[::-1]
    else:
        rising = 2 * n / (M + 1.0)
        # Odd length: the peak sample is not repeated.
        falling = rising[-2::-1]
    window = paddle.concat([rising, falling])

    return _truncate(window, needs_trunc)
|
||||
|
||||
|
||||
def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Bohman window.

    The Bohman window is the autocorrelation of a cosine window.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)

    # Interior points only; both end points are set to zero below.
    grid = paddle.linspace(-1, 1, M, dtype=dtype)
    fac = paddle.abs(grid[1:-1])
    interior = (1 - fac) * paddle.cos(
        math.pi * fac) + 1.0 / math.pi * paddle.sin(math.pi * fac)
    window = _cat([0, interior, 0], dtype)

    return _truncate(window, needs_trunc)
|
||||
|
||||
|
||||
def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Blackman window.

    The Blackman window is a taper built from the first three terms of a
    summation of cosines. It was designed to have close to the minimal leakage
    possible and is only slightly worse than a Kaiser window.

    Args:
        M (int): Requested number of points.
        sym (bool, optional): Forwarded to ``_general_cosine``. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: Result of ``_general_cosine`` with the Blackman coefficients.
    """
    coefficients = [0.42, 0.50, 0.08]
    return _general_cosine(M, coefficients, sym, dtype=dtype)
|
||||
|
||||
|
||||
def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a window with a simple cosine shape.

    Each sample is ``sin(pi / M * (n + 0.5))`` for ``n`` in ``arange(0, M)``,
    using the length returned by ``_extend``.

    Args:
        M (int): Requested number of points.
        sym (bool, optional): Forwarded to ``_extend``. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window after being passed through ``_truncate``.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    phase = paddle.arange(0, M, dtype=dtype) + .5
    w = paddle.sin(math.pi / M * phase)
    return _truncate(w, needs_trunc)
|
||||
|
||||
|
||||
def get_window(window: Union[str, Tuple[str, float]],
               win_length: int,
               fftbins: bool=True,
               dtype: str='float64') -> Tensor:
    """Return a window of a given length and type.

    The window name is resolved to the module-level function named
    ``'_' + name``; that function is called with ``win_length``, any extra
    tuple elements as positional arguments, and ``sym`` / ``dtype`` keywords.

    Args:
        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Pass a tuple ``(name, param, ...)`` for windows that take parameters.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.

    Returns:
        Tensor: The window represented as a tensor.

    Raises:
        ValueError: If ``window`` is neither a str nor a tuple, if 'gaussian'
            or 'exponential' is given without parameters, or if the name does
            not resolve to a window function.
    """
    sym = not fftbins

    args = ()
    if isinstance(window, tuple):
        winstr = window[0]
        if len(window) > 1:
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
            raise ValueError("The '" + window + "' window needs one or "
                             "more parameters -- pass a tuple.")
        winstr = window
    else:
        raise ValueError("%s as window type is not supported." %
                         str(type(window)))

    # Resolve the name with a plain dictionary lookup instead of eval():
    # eval() raised NameError (not the KeyError that was being caught) for an
    # unknown name, and it would execute arbitrary expressions passed in as
    # the window name.
    winfunc = globals().get('_' + winstr) if isinstance(winstr, str) else None
    if not callable(winfunc):
        raise ValueError("Unknown window type.")

    params = (win_length, ) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)
|
@ -1,13 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -1,15 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .kaldi import fbank
|
||||
from .kaldi import pitch
|
@ -1,132 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddlespeech
|
||||
from paddlespeech.audio._internal import module_utils
|
||||
|
||||
__all__ = [
|
||||
'fbank',
|
||||
'pitch',
|
||||
]
|
||||
|
||||
|
||||
@module_utils.requires_kaldi()
def fbank(
        wav,
        samp_freq: int=16000,
        frame_shift_ms: float=10.0,
        frame_length_ms: float=25.0,
        dither: float=0.0,
        preemph_coeff: float=0.97,
        remove_dc_offset: bool=True,
        window_type: str='povey',
        round_to_power_of_two: bool=True,
        blackman_coeff: float=0.42,
        snip_edges: bool=True,
        allow_downsample: bool=False,
        allow_upsample: bool=False,
        max_feature_vectors: int=-1,
        num_bins: int=23,
        low_freq: float=20,
        high_freq: float=0,
        vtln_low: float=100,
        vtln_high: float=-500,
        debug_mel: bool=False,
        htk_mode: bool=False,
        use_energy: bool=False,  # fbank opts
        energy_floor: float=0.0,
        raw_energy: bool=True,
        htk_compat: bool=False,
        use_log_fbank: bool=True,
        use_power: bool=True):
    """Compute filter-bank features with the compiled ``_paddleaudio`` extension.

    Every keyword argument is copied onto the attribute of the same name on
    one of three option objects, which are then handed to ``ComputeFbank``:

    * ``FrameExtractionOptions``: ``samp_freq`` through ``max_feature_vectors``.
    * ``MelBanksOptions``: ``num_bins`` through ``htk_mode``.
    * ``FbankOptions``: ``use_energy`` through ``use_power``.

    Args:
        wav: Input passed unchanged to ``ComputeFbank``.

    Returns:
        Whatever ``ComputeFbank`` returns for the given options and ``wav``.
    """
    backend = paddlespeech.audio._paddleaudio
    frame_opts = backend.FrameExtractionOptions()
    mel_opts = backend.MelBanksOptions()
    fbank_opts = backend.FbankOptions()

    # (options object, values to set) in the same order as the signature.
    assignments = (
        (frame_opts, {
            'samp_freq': samp_freq,
            'frame_shift_ms': frame_shift_ms,
            'frame_length_ms': frame_length_ms,
            'dither': dither,
            'preemph_coeff': preemph_coeff,
            'remove_dc_offset': remove_dc_offset,
            'window_type': window_type,
            'round_to_power_of_two': round_to_power_of_two,
            'blackman_coeff': blackman_coeff,
            'snip_edges': snip_edges,
            'allow_downsample': allow_downsample,
            'allow_upsample': allow_upsample,
            'max_feature_vectors': max_feature_vectors,
        }),
        (mel_opts, {
            'num_bins': num_bins,
            'low_freq': low_freq,
            'high_freq': high_freq,
            'vtln_low': vtln_low,
            'vtln_high': vtln_high,
            'debug_mel': debug_mel,
            'htk_mode': htk_mode,
        }),
        (fbank_opts, {
            'use_energy': use_energy,
            'energy_floor': energy_floor,
            'raw_energy': raw_energy,
            'htk_compat': htk_compat,
            'use_log_fbank': use_log_fbank,
            'use_power': use_power,
        }),
    )
    for options, values in assignments:
        for attr_name, attr_value in values.items():
            setattr(options, attr_name, attr_value)

    return backend.ComputeFbank(frame_opts, mel_opts, fbank_opts, wav)
|
||||
|
||||
|
||||
@module_utils.requires_kaldi()
def pitch(wav,
          samp_freq: int=16000,
          frame_shift_ms: float=10.0,
          frame_length_ms: float=25.0,
          preemph_coeff: float=0.0,
          min_f0: int=50,
          max_f0: int=400,
          soft_min_f0: float=10.0,
          penalty_factor: float=0.1,
          lowpass_cutoff: int=1000,
          resample_freq: int=4000,
          delta_pitch: float=0.005,
          nccf_ballast: int=7000,
          lowpass_filter_width: int=1,
          upsample_filter_width: int=5,
          max_frames_latency: int=0,
          frames_per_chunk: int=0,
          simulate_first_pass_online: bool=False,
          recompute_frame: int=500,
          nccf_ballast_online: bool=False,
          snip_edges: bool=True):
    """Compute pitch features with the compiled ``_paddleaudio`` extension.

    Every keyword argument is copied onto the attribute of the same name on a
    ``PitchExtractionOptions`` object, which is then passed to
    ``ComputeKaldiPitch`` together with ``wav``.

    Args:
        wav: Input passed unchanged to ``ComputeKaldiPitch``.

    Returns:
        Whatever ``ComputeKaldiPitch`` returns for the given options and ``wav``.
    """
    backend = paddlespeech.audio._paddleaudio
    pitch_opts = backend.PitchExtractionOptions()

    # Listed in the same order as the signature.
    settings = {
        'samp_freq': samp_freq,
        'frame_shift_ms': frame_shift_ms,
        'frame_length_ms': frame_length_ms,
        'preemph_coeff': preemph_coeff,
        'min_f0': min_f0,
        'max_f0': max_f0,
        'soft_min_f0': soft_min_f0,
        'penalty_factor': penalty_factor,
        'lowpass_cutoff': lowpass_cutoff,
        'resample_freq': resample_freq,
        'delta_pitch': delta_pitch,
        'nccf_ballast': nccf_ballast,
        'lowpass_filter_width': lowpass_filter_width,
        'upsample_filter_width': upsample_filter_width,
        'max_frames_latency': max_frames_latency,
        'frames_per_chunk': frames_per_chunk,
        'simulate_first_pass_online': simulate_first_pass_online,
        'recompute_frame': recompute_frame,
        'nccf_ballast_online': nccf_ballast_online,
        'snip_edges': snip_edges,
    }
    for attr_name, attr_value in settings.items():
        setattr(pitch_opts, attr_name, attr_value)

    feat = backend.ComputeKaldiPitch(pitch_opts, wav)
    return feat
|
@ -1,15 +0,0 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .eer import compute_eer
|
||||
from .eer import compute_minDCF
|
@ -1,100 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from sklearn.metrics import roc_curve
|
||||
|
||||
|
||||
def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
    """Compute the equal error rate (EER) and the threshold where it occurs.

    The ROC curve is computed with ``sklearn.metrics.roc_curve``; the operating
    point is the one where the false-negative rate (``1 - tpr``) and the
    false-positive rate are closest, ignoring NaNs.

    Args:
        labels (np.ndarray): the trial labels, shape: [N], one-dimensional, N is the number of samples
        scores (np.ndarray): the trial scores, shape: [N], one-dimensional, N is the number of samples

    Returns:
        List[float]: eer and the specific threshold (returned as a 2-tuple)
    """
    fpr, tpr, thresholds = roc_curve(y_true=labels, y_score=scores)
    gap = np.absolute((1 - tpr) - fpr)
    crossover = np.nanargmin(gap)
    return fpr[crossover], thresholds[crossover]
|
||||
|
||||
|
||||
def compute_minDCF(positive_scores,
                   negative_scores,
                   c_miss=1.0,
                   c_fa=1.0,
                   p_target=0.01):
    """Compute the minimum detection cost (minDCF) and its threshold.

    This is modified from SpeechBrain
    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509

    minDCF is commonly used to evaluate speaker verification systems. It is
    the minimum, over the candidate thresholds, of

        C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)

    where p_miss is the miss probability and p_fa is the false-alarm
    probability at a given threshold.

    Args:
        positive_scores (Paddle.Tensor): The scores from entries of the same class.
        negative_scores (Paddle.Tensor): The scores from entries of different classes.
        c_miss (float, optional): Cost assigned to a missing error (default 1.0).
        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
        p_target (float, optional): Prior probability of having a target (default 0.01).

    Returns:
        List[float]: min dcf and the specific threshold (returned as a 2-tuple)
    """
    # Drop size-1 dimensions when the scores arrive with more than one axis.
    if len(positive_scores.shape) > 1:
        positive_scores = positive_scores.squeeze()
    if len(negative_scores.shape) > 1:
        negative_scores = negative_scores.squeeze()

    # Candidate thresholds: every distinct score plus the midpoint between
    # each pair of neighbouring distinct scores.
    all_scores = paddle.concat([positive_scores, negative_scores])
    thresholds = paddle.unique(paddle.sort(all_scores))
    midpoints = (thresholds[0:-1] + thresholds[1:]) / 2
    thresholds = paddle.sort(paddle.concat([thresholds, midpoints]))
    num_thresholds = len(thresholds)

    # False rejection rate (miss): positives scoring at or below a threshold.
    tiled_pos = paddle.concat(num_thresholds * [positive_scores.unsqueeze(0)])
    missed = tiled_pos.transpose(perm=[1, 0]) <= thresholds
    p_miss = (missed.sum(0)).astype("float32") / tiled_pos.shape[1]
    del tiled_pos
    del missed

    # False acceptance rate (false alarm): negatives scoring above a threshold.
    tiled_neg = paddle.concat(num_thresholds * [negative_scores.unsqueeze(0)])
    accepted = tiled_neg.transpose(perm=[1, 0]) > thresholds
    p_fa = (accepted.sum(0)).astype("float32") / tiled_neg.shape[1]
    del tiled_neg
    del accepted

    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
    best = paddle.argmin(c_det, axis=0)
    lowest_cost = paddle.min(c_det, axis=0)
    return float(lowest_cost), float(thresholds[best])
|
@ -1,25 +0,0 @@
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
|
||||
from .sox_effects import (
|
||||
apply_effects_file,
|
||||
apply_effects_tensor,
|
||||
effect_names,
|
||||
init_sox_effects,
|
||||
shutdown_sox_effects,
|
||||
)
|
||||
|
||||
|
||||
# When the sox backend is available, initialise its effects at import time
# and register the matching shutdown to run at interpreter exit.
if _mod_utils.is_sox_available():
    import atexit

    init_sox_effects()
    atexit.register(shutdown_sox_effects)
|
||||
|
||||
__all__ = [
|
||||
"init_sox_effects",
|
||||
"shutdown_sox_effects",
|
||||
"effect_names",
|
||||
"apply_effects_tensor",
|
||||
"apply_effects_file",
|
||||
]
|
||||
|
@ -1,238 +0,0 @@
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
import paddle
|
||||
import numpy
|
||||
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
from paddlespeech.audio.utils.sox_utils import list_effects
|
||||
from paddlespeech.audio import _paddleaudio as paddleaudio
|
||||
|
||||
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/sox_effects/sox_effects.py
|
||||
|
||||
@_mod_utils.requires_sox()
def init_sox_effects():
    """Initialize the resources required to use sox effects.

    Note:
        You do not need to call this function manually; it is called
        automatically.

        Calling it again after initialization is safe as long as
        :func:`shutdown_sox_effects` has not been called. After
        :func:`shutdown_sox_effects`, SoX effects can no longer be used and
        initializing again results in an error.
    """
    paddleaudio.sox_effects_initialize_sox_effects()
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def shutdown_sox_effects():
    """Clean up the resources required to use sox effects.

    Note:
        You do not need to call this function manually; it is called
        automatically.

        It is safe to call this function multiple times. Once it has been
        called, SoX effects can no longer be used and initializing again
        results in an error.
    """
    paddleaudio.sox_effects_shutdown_sox_effects()
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def effect_names() -> List[str]:
    """Get the list of valid sox effect names.

    Returns:
        List[str]: the keys of the mapping returned by ``list_effects()``.

    Example
        >>> paddleaudio.sox_effects.effect_names()
        ['allpass', 'band', 'bandpass', ... ]
    """
    available = list_effects()
    return list(available.keys())
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def apply_effects_tensor(
        tensor: paddle.Tensor,
        sample_rate: int,
        effects: List[List[str]],
        channels_first: bool = True,
) -> Tuple[paddle.Tensor, int]:
    """Apply sox effects to the given Tensor.

    .. devices:: CPU

    Note:
        This function only works on CPU Tensors.
        It behaves much like the ``sox`` command, with slight differences.
        For example, the ``sox`` command adds certain effects automatically
        (such as a ``rate`` effect after ``speed``, ``pitch`` and others),
        whereas this function applies only the given effects. Therefore, to
        actually apply the ``speed`` effect you also need to give a ``rate``
        effect with the desired sampling rate.

    Args:
        tensor (paddle.Tensor): Input 2D CPU Tensor.
        sample_rate (int): Sample rate
        effects (List[List[str]]): List of effects.
        channels_first (bool, optional): Indicates if the input Tensor's dimension is
            `[channels, time]` or `[time, channels]`

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        The resulting Tensor has the same ``dtype`` as the input Tensor, and
        the same channels order. The shape of the Tensor can be different based on the
        effects applied. Sample rate can also be different based on the effects applied.

    Raises:
        RuntimeError: If the extension returns ``None``.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Generate pseudo wave:
        >>> # normalized, channels first, 2ch, sampling rate 16000, 1 second
        >>> sample_rate = 16000
        >>> waveform = 2 * paddle.rand([2, sample_rate * 1]) - 1
        >>> waveform.shape
        [2, 16000]
        >>>
        >>> # Apply effects
        >>> waveform, sample_rate = apply_effects_tensor(
        ...     waveform, sample_rate, effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> # The new waveform is sampling rate 8000, 1 second.
        >>> # normalization and channel order are preserved
        >>> waveform.shape
        [2, 8000]
        >>> sample_rate
        8000

    """
    # The extension is handed a numpy array, not the paddle Tensor.
    samples = tensor.numpy()
    ret = paddleaudio.sox_effects_apply_effects_tensor(
        samples, sample_rate, effects, channels_first)
    if ret is None:
        raise RuntimeError("Failed to apply sox effect")
    return (paddle.to_tensor(ret[0]), ret[1])
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def apply_effects_file(
        path: str,
        effects: List[List[str]],
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
    """Apply sox effects to an audio file and load the resulting data as a Tensor.

    Note:
        This function behaves much like the ``sox`` command, with slight
        differences. For example, the ``sox`` command adds certain effects
        automatically (such as a ``rate`` effect after ``speed``, ``pitch``
        etc), whereas this function applies only the given effects. Therefore,
        to actually apply the ``speed`` effect you also need to give a ``rate``
        effect with the desired sampling rate, because internally the ``speed``
        effect only alters the sampling rate and leaves samples untouched.

    Args:
        path (path-like object or file-like object):
            Source of the audio. An object with a ``read`` attribute is treated
            as a file object; anything else is converted with ``os.fspath``.
        effects (List[List[str]]): List of effects.
        normalize (bool, optional):
            When ``True``, this function always return ``float32``, and sample values are
            normalized to ``[-1.0, 1.0]``.
            If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
            integer type. This argument has no effect for formats other
            than integer WAV type.
        channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`.
            Otherwise, the returned Tensor's dimension is `[time, channel]`.
        format (str or None, optional):
            Override the format detection with the given format.
            Providing the argument might help when libsox can not infer the format
            from header or extension.

    Returns:
        (Tensor, int): Resulting Tensor and sample rate.
        If ``normalize=True``, the resulting Tensor is always ``float32`` type.
        If ``normalize=False`` and the input audio file is of integer WAV file, then the
        resulting Tensor has corresponding integer type. (Note 24 bit integer type is not supported)
        If ``channels_first=True``, the resulting Tensor has dimension `[channel, time]`,
        otherwise `[time, channel]`.

    Raises:
        RuntimeError: If the extension returns ``None``.

    Example - Basic usage
        >>>
        >>> # Defines the effects to apply
        >>> effects = [
        ...     ['gain', '-n'],  # normalises to 0dB
        ...     ['pitch', '5'],  # 5 cent pitch shift
        ...     ['rate', '8000'],  # resample to 8000 Hz
        ... ]
        >>>
        >>> # Apply effects and load data with channels_first=True
        >>> waveform, sample_rate = apply_effects_file("data.wav", effects, channels_first=True)
        >>>
        >>> # Check the result
        >>> waveform.shape
        [2, 8000]
        >>> sample_rate
        8000

    Example - Apply random speed perturbation to dataset
        >>>
        >>> # Load data from file, apply random speed perturbation
        >>> class RandomPerturbationFile(paddle.io.Dataset):
        ...     \"\"\"Given flist, apply random speed perturbation
        ...
        ...     Suppose all the input files are at least one second long.
        ...     \"\"\"
        ...     def __init__(self, flist: List[str], sample_rate: int):
        ...         super().__init__()
        ...         self.flist = flist
        ...         self.sample_rate = sample_rate
        ...
        ...     def __getitem__(self, index):
        ...         speed = 0.5 + 1.5 * random.random()
        ...         effects = [
        ...             ['gain', '-n', '-10'],  # apply 10 db attenuation
        ...             ['remix', '-'],  # merge all the channels
        ...             ['speed', f'{speed:.5f}'],  # duration is now 0.5 ~ 2.0 seconds.
        ...             ['rate', f'{self.sample_rate}'],
        ...             ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
        ...             ['trim', '0', '2'],  # get the first 2 seconds
        ...         ]
        ...         waveform, _ = paddleaudio.sox_effects.apply_effects_file(
        ...             self.flist[index], effects)
        ...         return waveform
        ...
        ...     def __len__(self):
        ...         return len(self.flist)
        ...
        >>> dataset = RandomPerturbationFile(file_list, sample_rate=8000)
        >>> loader = paddle.io.DataLoader(dataset, batch_size=32)
        >>> for batch in loader:
        ...     pass
    """
    if hasattr(path, "read"):
        # File-like objects go through the dedicated file-object binding.
        ret = paddleaudio.apply_effects_fileobj(path, effects, normalize,
                                                channels_first, format)
    else:
        path = os.fspath(path)
        ret = paddleaudio.sox_effects_apply_effects_file(
            path, effects, normalize, channels_first, format)

    if ret is None:
        raise RuntimeError("Failed to load audio from {}".format(path))
    return (paddle.to_tensor(ret[0]), ret[1])
|
@ -1,201 +0,0 @@
|
||||
# With MSVC, ask CMake to export all symbols of shared libraries.
if (MSVC)
  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()

################################################################################
# libpaddleaudio
################################################################################
# Inputs for the libpaddleaudio target; the optional components below append
# to these lists.
set(LIBPADDLEAUDIO_SOURCES utils.cpp)
set(LIBPADDLEAUDIO_INCLUDE_DIRS ${PROJECT_SOURCE_DIR})
set(LIBPADDLEAUDIO_LINK_LIBRARIES)
set(LIBPADDLEAUDIO_COMPILE_DEFINITIONS)

#------------------------------------------------------------------------------#
# START OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#

if(BUILD_SOX)
  list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libsox)
  # No sox sources are added to the library; the entries are kept commented out.
  list(
    APPEND
    LIBPADDLEAUDIO_SOURCES
    #sox/io.cpp
    #sox/utils.cpp
    #sox/effects.cpp
    #sox/effects_chain.cpp
    #sox/types.cpp
    )
  list(APPEND LIBPADDLEAUDIO_COMPILE_DEFINITIONS INCLUDE_SOX)
endif()

if(BUILD_KALDI)
  list(APPEND LIBPADDLEAUDIO_LINK_LIBRARIES libkaldi)
  list(
    APPEND
    LIBPADDLEAUDIO_COMPILE_DEFINITIONS
    INCLUDE_KALDI
    COMPILE_WITHOUT_OPENFST
    )
endif()

#------------------------------------------------------------------------------#
# END OF CUSTOMIZATION LOGICS
#------------------------------------------------------------------------------#
|
||||
|
||||
# define_library(<name> <sources> <include_dirs> <link_libraries> <compile_defs>)
#
# Adds a SHARED library target with the given sources, private include
# directories, link libraries and private compile definitions. The target has
# no file-name prefix, uses the ".pyd" suffix under MSVC, and is installed
# into "lib".
function (define_library name source include_dirs link_libraries compile_defs)
  add_library(${name} SHARED ${source})
  target_include_directories(${name} PRIVATE ${include_dirs})
  target_link_libraries(${name} ${link_libraries})
  target_compile_definitions(${name} PRIVATE ${compile_defs})
  set_target_properties(${name} PROPERTIES PREFIX "")
  if (MSVC)
    set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
  endif()
  install(
    TARGETS ${name}
    LIBRARY DESTINATION lib
    RUNTIME DESTINATION lib  # For Windows
    )
endfunction()
|
||||
|
||||
|
||||
# Create the libpaddleaudio target from the lists assembled above.
define_library(
  libpaddleaudio
  "${LIBPADDLEAUDIO_SOURCES}"
  "${LIBPADDLEAUDIO_INCLUDE_DIRS}"
  "${LIBPADDLEAUDIO_LINK_LIBRARIES}"
  "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
  )

# Cache the link line for libpaddleaudio. Outside Apple platforms the library
# is wrapped in --no-as-needed / --as-needed linker flags.
if (APPLE)
  set(TORCHAUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "")
else()
  set(TORCHAUDIO_LIBRARY -Wl,--no-as-needed libpaddleaudio -Wl,--as-needed CACHE INTERNAL "")
endif()
|
||||
|
||||
################################################################################
|
||||
# _paddleaudio.so
|
||||
################################################################################
|
||||
if (BUILD_PADDLEAUDIO_PYTHON_EXTENSION)
  # On Windows the extension links against the Python3 development target.
  if (WIN32)
    find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
    set(ADDITIONAL_ITEMS Python3::Python)
  endif()

  # define_extension(<name> <sources> <include_dirs> <libraries> <definitions>)
  #
  # Adds a SHARED library target for a Python extension module: no file-name
  # prefix, ".pyd" suffix under MSVC, "-undefined dynamic_lookup" link flags
  # on Apple, installed into the install-prefix root.
  function(define_extension name sources include_dirs libraries definitions)
    add_library(${name} SHARED ${sources})
    target_compile_definitions(${name} PRIVATE "${definitions}")
    target_include_directories(
      ${name} PRIVATE ${PROJECT_SOURCE_DIR} ${Python_INCLUDE_DIR} ${pybind11_INCLUDE_DIR} ${include_dirs})
    target_link_libraries(
      ${name}
      ${libraries}
      ${TORCH_PYTHON_LIBRARY}
      ${ADDITIONAL_ITEMS}
      )
    set_target_properties(${name} PROPERTIES PREFIX "")
    if (MSVC)
      set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
    endif()
    if (APPLE)
      # https://github.com/facebookarchive/caffe2/issues/854#issuecomment-364538485
      # https://github.com/pytorch/pytorch/commit/73f6715f4725a0723d8171d3131e09ac7abf0666
      set_target_properties(${name} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
    endif()
    install(
      TARGETS ${name}
      LIBRARY DESTINATION .
      RUNTIME DESTINATION .  # For Windows
      )
  endfunction()

  set(EXTENSION_SOURCES pybind/pybind.cpp)
  #----------------------------------------------------------------------------#
  # START OF CUSTOMIZATION LOGICS
  #----------------------------------------------------------------------------#
  if(BUILD_SOX)
    list(
      APPEND
      EXTENSION_SOURCES
      pybind/sox/effects.cpp
      pybind/sox/effects_chain.cpp
      pybind/sox/io.cpp
      pybind/sox/types.cpp
      pybind/sox/utils.cpp
      )
  endif()

  if(BUILD_KALDI)
    list(
      APPEND
      EXTENSION_SOURCES
      pybind/kaldi/kaldi_feature_wrapper.cc
      pybind/kaldi/kaldi_feature.cc
      )
  endif()
  #----------------------------------------------------------------------------#
  # END OF CUSTOMIZATION LOGICS
  #----------------------------------------------------------------------------#
  define_extension(
    _paddleaudio
    "${EXTENSION_SOURCES}"
    ""
    libpaddleaudio
    "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}"
    )
  # if(BUILD_CTC_DECODER)
  #   set(
  #     DECODER_EXTENSION_SOURCES
  #     decoder/bindings/pybind.cpp
  #     )
  #   define_extension(
  #     _paddleaudio_decoder
  #     "${DECODER_EXTENSION_SOURCES}"
  #     ""
  #     "libpaddleaudio_decoder"
  #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
  #     )
  # endif()
  # if(USE_FFMPEG)
  #   set(
  #     FFMPEG_EXTENSION_SOURCES
  #     ffmpeg/pybind/typedefs.cpp
  #     ffmpeg/pybind/pybind.cpp
  #     ffmpeg/pybind/stream_reader.cpp
  #     )
  #   define_extension(
  #     _paddleaudio_ffmpeg
  #     "${FFMPEG_EXTENSION_SOURCES}"
  #     "${FFMPEG_INCLUDE_DIRS}"
  #     "libpaddleaudio_ffmpeg"
  #     "${LIBPADDLEAUDIO_DECODER_DEFINITIONS}"
  #     )
  # endif()
endif()
|
@ -1,121 +0,0 @@
|
||||
Creative Commons Legal Code
|
||||
|
||||
CC0 1.0 Universal
|
||||
|
||||
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
|
||||
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
|
||||
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
|
||||
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
|
||||
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
|
||||
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
|
||||
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
|
||||
HEREUNDER.
|
||||
|
||||
Statement of Purpose
|
||||
|
||||
The laws of most jurisdictions throughout the world automatically confer
|
||||
exclusive Copyright and Related Rights (defined below) upon the creator
|
||||
and subsequent owner(s) (each and all, an "owner") of an original work of
|
||||
authorship and/or a database (each, a "Work").
|
||||
|
||||
Certain owners wish to permanently relinquish those rights to a Work for
|
||||
the purpose of contributing to a commons of creative, cultural and
|
||||
scientific works ("Commons") that the public can reliably and without fear
|
||||
of later claims of infringement build upon, modify, incorporate in other
|
||||
works, reuse and redistribute as freely as possible in any form whatsoever
|
||||
and for any purposes, including without limitation commercial purposes.
|
||||
These owners may contribute to the Commons to promote the ideal of a free
|
||||
culture and the further production of creative, cultural and scientific
|
||||
works, or to gain reputation or greater distribution for their Work in
|
||||
part through the use and efforts of others.
|
||||
|
||||
For these and/or other purposes and motivations, and without any
|
||||
expectation of additional consideration or compensation, the person
|
||||
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
|
||||
is an owner of Copyright and Related Rights in the Work, voluntarily
|
||||
elects to apply CC0 to the Work and publicly distribute the Work under its
|
||||
terms, with knowledge of his or her Copyright and Related Rights in the
|
||||
Work and the meaning and intended legal effect of CC0 on those rights.
|
||||
|
||||
1. Copyright and Related Rights. A Work made available under CC0 may be
|
||||
protected by copyright and related or neighboring rights ("Copyright and
|
||||
Related Rights"). Copyright and Related Rights include, but are not
|
||||
limited to, the following:
|
||||
|
||||
i. the right to reproduce, adapt, distribute, perform, display,
|
||||
communicate, and translate a Work;
|
||||
ii. moral rights retained by the original author(s) and/or performer(s);
|
||||
iii. publicity and privacy rights pertaining to a person's image or
|
||||
likeness depicted in a Work;
|
||||
iv. rights protecting against unfair competition in regards to a Work,
|
||||
subject to the limitations in paragraph 4(a), below;
|
||||
v. rights protecting the extraction, dissemination, use and reuse of data
|
||||
in a Work;
|
||||
vi. database rights (such as those arising under Directive 96/9/EC of the
|
||||
European Parliament and of the Council of 11 March 1996 on the legal
|
||||
protection of databases, and under any national implementation
|
||||
thereof, including any amended or successor version of such
|
||||
directive); and
|
||||
vii. other similar, equivalent or corresponding rights throughout the
|
||||
world based on applicable law or treaty, and any national
|
||||
implementations thereof.
|
||||
|
||||
2. Waiver. To the greatest extent permitted by, but not in contravention
|
||||
of, applicable law, Affirmer hereby overtly, fully, permanently,
|
||||
irrevocably and unconditionally waives, abandons, and surrenders all of
|
||||
Affirmer's Copyright and Related Rights and associated claims and causes
|
||||
of action, whether now known or unknown (including existing as well as
|
||||
future claims and causes of action), in the Work (i) in all territories
|
||||
worldwide, (ii) for the maximum duration provided by applicable law or
|
||||
treaty (including future time extensions), (iii) in any current or future
|
||||
medium and for any number of copies, and (iv) for any purpose whatsoever,
|
||||
including without limitation commercial, advertising or promotional
|
||||
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
|
||||
member of the public at large and to the detriment of Affirmer's heirs and
|
||||
successors, fully intending that such Waiver shall not be subject to
|
||||
revocation, rescission, cancellation, termination, or any other legal or
|
||||
equitable action to disrupt the quiet enjoyment of the Work by the public
|
||||
as contemplated by Affirmer's express Statement of Purpose.
|
||||
|
||||
3. Public License Fallback. Should any part of the Waiver for any reason
|
||||
be judged legally invalid or ineffective under applicable law, then the
|
||||
Waiver shall be preserved to the maximum extent permitted taking into
|
||||
account Affirmer's express Statement of Purpose. In addition, to the
|
||||
extent the Waiver is so judged Affirmer hereby grants to each affected
|
||||
person a royalty-free, non transferable, non sublicensable, non exclusive,
|
||||
irrevocable and unconditional license to exercise Affirmer's Copyright and
|
||||
Related Rights in the Work (i) in all territories worldwide, (ii) for the
|
||||
maximum duration provided by applicable law or treaty (including future
|
||||
time extensions), (iii) in any current or future medium and for any number
|
||||
of copies, and (iv) for any purpose whatsoever, including without
|
||||
limitation commercial, advertising or promotional purposes (the
|
||||
"License"). The License shall be deemed effective as of the date CC0 was
|
||||
applied by Affirmer to the Work. Should any part of the License for any
|
||||
reason be judged legally invalid or ineffective under applicable law, such
|
||||
partial invalidity or ineffectiveness shall not invalidate the remainder
|
||||
of the License, and in such case Affirmer hereby affirms that he or she
|
||||
will not (i) exercise any of his or her remaining Copyright and Related
|
||||
Rights in the Work or (ii) assert any associated claims and causes of
|
||||
action with respect to the Work, in either case contrary to Affirmer's
|
||||
express Statement of Purpose.
|
||||
|
||||
4. Limitations and Disclaimers.
|
||||
|
||||
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
||||
surrendered, licensed or otherwise affected by this document.
|
||||
b. Affirmer offers the Work as-is and makes no representations or
|
||||
warranties of any kind concerning the Work, express, implied,
|
||||
statutory or otherwise, including without limitation warranties of
|
||||
title, merchantability, fitness for a particular purpose, non
|
||||
infringement, or the absence of latent or other defects, accuracy, or
|
||||
the present or absence of errors, whether or not discoverable, all to
|
||||
the greatest extent permissible under applicable law.
|
||||
c. Affirmer disclaims responsibility for clearing rights of other persons
|
||||
that may apply to the Work or any use thereof, including without
|
||||
limitation any person's Copyright and Related Rights in the Work.
|
||||
Further, Affirmer disclaims responsibility for obtaining any necessary
|
||||
consents, permissions or other rights required for any use of the
|
||||
Work.
|
||||
d. Affirmer understands and acknowledges that Creative Commons is not a
|
||||
party to this document and has no duty or obligation with respect to
|
||||
this CC0 or use of the Work.
|
File diff suppressed because it is too large
Load Diff
@ -1,49 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "pybind11/pybind11.h"
|
||||
#include "pybind11/numpy.h"
|
||||
#include "feat/feature-window.h"
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace kaldi {
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
template <class F>
|
||||
class StreamingFeatureTpl {
|
||||
public:
|
||||
typedef typename F::Options Options;
|
||||
StreamingFeatureTpl(const Options& opts);
|
||||
bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
|
||||
::kaldi::Vector<::kaldi::BaseFloat>* feats);
|
||||
void Reset() { remained_wav_.Resize(0); }
|
||||
|
||||
int Dim() { return computer_.Dim(); }
|
||||
|
||||
private:
|
||||
bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
|
||||
::kaldi::Vector<::kaldi::BaseFloat>* feats);
|
||||
Options opts_;
|
||||
::kaldi::FeatureWindowFunction window_function_;
|
||||
::kaldi::Vector<::kaldi::BaseFloat> remained_wav_;
|
||||
F computer_;
|
||||
};
|
||||
|
||||
} // namespace kaldi
|
||||
}  // namespace paddleaudio
|
||||
|
||||
#include "feature_common_inl.h"
|
@ -1,93 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace kaldi {
|
||||
|
||||
template <class F>
|
||||
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts)
|
||||
: opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
|
||||
// window_function_(computer_.GetFrameOptions()) { the opt set to zero
|
||||
}
|
||||
|
||||
template <class F>
|
||||
bool StreamingFeatureTpl<F>::ComputeFeature(
|
||||
const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav,
|
||||
::kaldi::Vector<::kaldi::BaseFloat>* feats) {
|
||||
// append remaned waves
|
||||
::kaldi::int32 wav_len = wav.Dim();
|
||||
if (wav_len == 0) return false;
|
||||
::kaldi::int32 left_len = remained_wav_.Dim();
|
||||
::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len);
|
||||
waves.Range(0, left_len).CopyFromVec(remained_wav_);
|
||||
waves.Range(left_len, wav_len).CopyFromVec(wav);
|
||||
|
||||
// cache remaned waves
|
||||
::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
|
||||
::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts);
|
||||
::kaldi::int32 frame_shift = frame_opts.WindowShift();
|
||||
::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames;
|
||||
remained_wav_.Resize(left_samples);
|
||||
remained_wav_.CopyFromVec(
|
||||
waves.Range(frame_shift * num_frames, left_samples));
|
||||
|
||||
// compute speech feature
|
||||
Compute(waves, feats);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Compute feat
|
||||
template <class F>
|
||||
bool StreamingFeatureTpl<F>::Compute(
|
||||
const ::kaldi::Vector<::kaldi::BaseFloat>& waves,
|
||||
::kaldi::Vector<::kaldi::BaseFloat>* feats) {
|
||||
::kaldi::BaseFloat vtln_warp = 1.0;
|
||||
const ::kaldi::FrameExtractionOptions& frame_opts =
|
||||
computer_.GetFrameOptions();
|
||||
::kaldi::int32 num_samples = waves.Dim();
|
||||
::kaldi::int32 frame_length = frame_opts.WindowSize();
|
||||
::kaldi::int32 sample_rate = frame_opts.samp_freq;
|
||||
if (num_samples < frame_length) {
|
||||
return false;
|
||||
}
|
||||
|
||||
::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts);
|
||||
feats->Resize(num_frames * Dim());
|
||||
|
||||
::kaldi::Vector<::kaldi::BaseFloat> window;
|
||||
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
|
||||
for (::kaldi::int32 frame = 0; frame < num_frames; frame++) {
|
||||
::kaldi::BaseFloat raw_log_energy = 0.0;
|
||||
::kaldi::ExtractWindow(0,
|
||||
waves,
|
||||
frame,
|
||||
frame_opts,
|
||||
window_function_,
|
||||
&window,
|
||||
need_raw_log_energy ? &raw_log_energy : NULL);
|
||||
|
||||
::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(),
|
||||
::kaldi::kUndefined);
|
||||
computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
|
||||
::kaldi::SubVector<::kaldi::BaseFloat> output_row(
|
||||
feats->Data() + frame * Dim(), Dim());
|
||||
output_row.CopyFromVec(this_feature);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace kaldi
|
||||
} // namespace paddleaudio
|
@ -1,75 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
|
||||
#include "feat/pitch-functions.h"
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace kaldi {
|
||||
|
||||
// (Re)creates the process-wide fbank extractor.
//
// The three option groups are merged into one ::kaldi::FbankOptions, which is
// handed to the KaldiFeatureWrapper singleton. Always returns true.
bool InitFbank(
    ::kaldi::FrameExtractionOptions frame_opts,
    ::kaldi::MelBanksOptions mel_opts,
    FbankOptions fbank_opts) {
    ::kaldi::FbankOptions kaldi_opts;
    kaldi_opts.frame_opts = frame_opts;
    kaldi_opts.mel_opts = mel_opts;

    // Copy the scalar switches field by field.
    kaldi_opts.use_energy = fbank_opts.use_energy;
    kaldi_opts.energy_floor = fbank_opts.energy_floor;
    kaldi_opts.raw_energy = fbank_opts.raw_energy;
    kaldi_opts.htk_compat = fbank_opts.htk_compat;
    kaldi_opts.use_log_fbank = fbank_opts.use_log_fbank;
    kaldi_opts.use_power = fbank_opts.use_power;

    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->InitFbank(kaldi_opts);
    return true;
}
|
||||
|
||||
// Runs one chunk of samples through the fbank extractor held by the
// KaldiFeatureWrapper singleton and returns its result unchanged.
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav) {
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    return wrapper->ComputeFbank(wav);
}
|
||||
|
||||
// One-shot fbank computation: re-initializes the shared extractor with the
// given options, computes the features of `wav`, then resets the extractor so
// no samples stay cached for a later call.
py::array_t<float> ComputeFbank(
    ::kaldi::FrameExtractionOptions frame_opts,
    ::kaldi::MelBanksOptions mel_opts,
    FbankOptions fbank_opts,
    const py::array_t<float>& wav) {
    InitFbank(frame_opts, mel_opts, fbank_opts);
    py::array_t<float> features = ComputeFbankStreaming(wav);

    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->ResetFbank();
    return features;
}
|
||||
|
||||
// Resets the fbank extractor held by the KaldiFeatureWrapper singleton.
void ResetFbank() {
    auto* wrapper = paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance();
    wrapper->ResetFbank();
}
|
||||
|
||||
// Computes Kaldi pitch features for `wav` and returns them as a 2-D float
// array with one row per row of the Kaldi feature matrix.
//
// The buffer of `wav` is read in place as `info.size` consecutive floats.
// NOTE(review): this presumably expects a C-contiguous array - confirm at
// the call sites.
py::array_t<float> ComputeKaldiPitch(
    const ::kaldi::PitchExtractionOptions& opts,
    const py::array_t<float>& wav) {
    py::buffer_info info = wav.request();
    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
        static_cast<float*>(info.ptr), info.size);

    ::kaldi::Matrix<::kaldi::BaseFloat> features;
    ::kaldi::ComputeKaldiPitch(opts, input_wav, &features);

    auto result = py::array_t<float>({features.NumRows(), features.NumCols()});
    // Copy row by row: only the NumCols() values of each matrix row are read.
    for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) {
        float* dst = result.mutable_data(row_idx);
        const ::kaldi::BaseFloat* src = features.Row(row_idx).Data();
        std::memcpy(dst, src, sizeof(float) * features.NumCols());
    }
    return result;
}
|
||||
|
||||
} // namespace kaldi
|
||||
} // namespace paddleaudio
|
@ -1,64 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <pybind11/numpy.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <string>
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h"
|
||||
#include "feat/pitch-functions.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace kaldi {
|
||||
|
||||
// Plain option bundle mirrored onto ::kaldi::FbankOptions by InitFbank().
// The default values are given next to each field.
struct FbankOptions {
    // Append an extra dimension with energy to the filter banks.
    bool use_energy = false;
    float energy_floor = 0.0;
    // If true, compute energy before preemphasis and windowing.
    bool raw_energy = true;
    // If true, put energy last (if using energy).
    bool htk_compat = false;
    // If true (default), produce log-filterbank, else linear.
    bool use_log_fbank = true;
    bool use_power = true;

    FbankOptions() {}
};
|
||||
|
||||
bool InitFbank(
|
||||
::kaldi::FrameExtractionOptions frame_opts,
|
||||
::kaldi::MelBanksOptions mel_opts,
|
||||
FbankOptions fbank_opts);
|
||||
|
||||
py::array_t<float> ComputeFbank(
|
||||
::kaldi::FrameExtractionOptions frame_opts,
|
||||
::kaldi::MelBanksOptions mel_opts,
|
||||
FbankOptions fbank_opts,
|
||||
const py::array_t<float>& wav);
|
||||
|
||||
py::array_t<float> ComputeFbankStreaming(const py::array_t<float>& wav);
|
||||
|
||||
void ResetFbank();
|
||||
|
||||
py::array_t<float> ComputeKaldiPitch(
|
||||
const ::kaldi::PitchExtractionOptions& opts,
|
||||
const py::array_t<float>& wav);
|
||||
|
||||
} // namespace kaldi
|
||||
} // namespace paddleaudio
|
@ -1,51 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature_wrapper.h"
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace kaldi {
|
||||
|
||||
// Returns the address of the single function-local static wrapper object,
// which is created on first use.
KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() {
    static KaldiFeatureWrapper singleton;
    KaldiFeatureWrapper* const handle = &singleton;
    return handle;
}
|
||||
|
||||
// Replaces the current fbank extractor (if any) with a new one built from
// `opts`. Always returns true.
bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) {
    fbank_ = std::unique_ptr<Fbank>(new Fbank(opts));
    return true;
}
|
||||
|
||||
// Computes fbank features for one chunk of samples.
//
// The buffer of `wav` is read in place as `info.size` consecutive floats.
// Returns an empty array when the extractor reports failure or produced no
// frame; otherwise returns the features reshaped to
// (feats.Dim() / Dim(), Dim()).
// Throws py::value_error when InitFbank() has not been called yet, instead
// of dereferencing a null extractor.
py::array_t<float> KaldiFeatureWrapper::ComputeFbank(
    const py::array_t<float> wav) {
    if (!fbank_) {
        throw py::value_error(
            "fbank extractor is not initialized; call InitFbank first");
    }

    py::buffer_info info = wav.request();
    ::kaldi::SubVector<::kaldi::BaseFloat> input_wav(
        static_cast<float*>(info.ptr), info.size);

    ::kaldi::Vector<::kaldi::BaseFloat> feats;
    const bool ok = fbank_->ComputeFeature(input_wav, &feats);
    if (!ok || feats.Dim() == 0) return py::array_t<float>();

    // Copy the flat feature vector into a freshly allocated numpy array.
    // (A stray `std::cout << std::endl;` used to print a blank line to stdout
    // on every call here; it was debug residue and has been removed.)
    auto result = py::array_t<float>(feats.Dim());
    float* res_ptr = result.mutable_data();
    for (int idx = 0; idx < feats.Dim(); ++idx) {
        res_ptr[idx] = feats(idx);
    }

    return result.reshape({feats.Dim() / Dim(), Dim()});
}
|
||||
|
||||
}  // namespace kaldi
|
||||
} // namespace paddleaudio
|
@ -1,40 +0,0 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "base/kaldi-common.h"
|
||||
#include "feat/feature-fbank.h"
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/kaldi/feature_common.h"
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace kaldi {
|
||||
|
||||
typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank;
|
||||
|
||||
class KaldiFeatureWrapper {
|
||||
public:
|
||||
static KaldiFeatureWrapper* GetInstance();
|
||||
bool InitFbank(::kaldi::FbankOptions opts);
|
||||
py::array_t<float> ComputeFbank(const py::array_t<float> wav);
|
||||
int Dim() { return fbank_->Dim(); }
|
||||
void ResetFbank() { fbank_->Reset(); }
|
||||
|
||||
private:
|
||||
std::unique_ptr<paddleaudio::kaldi::Fbank> fbank_;
|
||||
};
|
||||
|
||||
} // namespace kaldi
|
||||
} // namespace paddleaudio
|
@ -1,144 +0,0 @@
|
||||
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala), All rights reserved.
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/kaldi/kaldi_feature.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/io.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/effects.h"
|
||||
#include "paddlespeech/audio/third_party/kaldi/feat/feature-fbank.h"
|
||||
|
||||
#include <pybind11/stl.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
|
||||
// `tl::optional`
|
||||
namespace pybind11 { namespace detail {
|
||||
template <typename T>
|
||||
struct type_caster<tl::optional<T>> : optional_caster<tl::optional<T>> {};
|
||||
}}
|
||||
|
||||
// Python bindings of the `_paddleaudio` extension module.
//
// The sox bindings are compiled in when INCLUDE_SOX is defined and the Kaldi
// bindings when INCLUDE_KALDI is defined. The registration order below is
// kept as is.
PYBIND11_MODULE(_paddleaudio, m) {
#ifdef INCLUDE_SOX
    namespace io_ns = paddleaudio::sox_io;
    namespace utils_ns = paddleaudio::sox_utils;
    namespace effects_ns = paddleaudio::sox_effects;

    // Metadata / load / save entry points.
    m.def("get_info_file", &io_ns::get_info_file, "Get metadata of audio file.");
    m.def("get_info_fileobj",
          &io_ns::get_info_fileobj,
          "Get metadata of audio in file object.");
    m.def("load_audio_fileobj",
          &io_ns::load_audio_fileobj,
          "Load audio from file object.");
    m.def("save_audio_fileobj",
          &io_ns::save_audio_fileobj,
          "Save audio to file obj.");

    // sox io
    m.def("sox_io_get_info", &io_ns::get_info_file);
    m.def("sox_io_load_audio_file", &io_ns::load_audio_file);
    m.def("sox_io_save_audio_file", &io_ns::save_audio_file);

    // sox utils
    m.def("sox_utils_set_seed", &utils_ns::set_seed);
    m.def("sox_utils_set_verbosity", &utils_ns::set_verbosity);
    m.def("sox_utils_set_use_threads", &utils_ns::set_use_threads);
    m.def("sox_utils_set_buffer_size", &utils_ns::set_buffer_size);
    m.def("sox_utils_list_effects", &utils_ns::list_effects);
    m.def("sox_utils_list_read_formats", &utils_ns::list_read_formats);
    m.def("sox_utils_list_write_formats", &utils_ns::list_write_formats);
    m.def("sox_utils_get_buffer_size", &utils_ns::get_buffer_size);

    // sox effects
    m.def("apply_effects_fileobj",
          &effects_ns::apply_effects_fileobj,
          "Decode audio data from file-like obj and apply effects.");
    m.def("sox_effects_initialize_sox_effects",
          &effects_ns::initialize_sox_effects);
    m.def("sox_effects_shutdown_sox_effects",
          &effects_ns::shutdown_sox_effects);
    m.def("sox_effects_apply_effects_tensor",
          &effects_ns::apply_effects_tensor);
    m.def("sox_effects_apply_effects_file",
          &effects_ns::apply_effects_file);
#endif

#ifdef INCLUDE_KALDI
    using PitchOpts = kaldi::PitchExtractionOptions;
    using FrameOpts = kaldi::FrameExtractionOptions;
    using MelOpts = kaldi::MelBanksOptions;
    using FbankOpts = paddleaudio::kaldi::FbankOptions;

    m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank");

    // Options of the Kaldi pitch extractor.
    py::class_<PitchOpts>(m, "PitchExtractionOptions")
        .def(py::init<>())
        .def_readwrite("samp_freq", &PitchOpts::samp_freq)
        .def_readwrite("frame_shift_ms", &PitchOpts::frame_shift_ms)
        .def_readwrite("frame_length_ms", &PitchOpts::frame_length_ms)
        .def_readwrite("preemph_coeff", &PitchOpts::preemph_coeff)
        .def_readwrite("min_f0", &PitchOpts::min_f0)
        .def_readwrite("max_f0", &PitchOpts::max_f0)
        .def_readwrite("soft_min_f0", &PitchOpts::soft_min_f0)
        .def_readwrite("penalty_factor", &PitchOpts::penalty_factor)
        .def_readwrite("lowpass_cutoff", &PitchOpts::lowpass_cutoff)
        .def_readwrite("resample_freq", &PitchOpts::resample_freq)
        .def_readwrite("delta_pitch", &PitchOpts::delta_pitch)
        .def_readwrite("nccf_ballast", &PitchOpts::nccf_ballast)
        .def_readwrite("lowpass_filter_width", &PitchOpts::lowpass_filter_width)
        .def_readwrite("upsample_filter_width",
                       &PitchOpts::upsample_filter_width)
        .def_readwrite("max_frames_latency", &PitchOpts::max_frames_latency)
        .def_readwrite("frames_per_chunk", &PitchOpts::frames_per_chunk)
        .def_readwrite("simulate_first_pass_online",
                       &PitchOpts::simulate_first_pass_online)
        .def_readwrite("recompute_frame", &PitchOpts::recompute_frame)
        .def_readwrite("nccf_ballast_online", &PitchOpts::nccf_ballast_online)
        .def_readwrite("snip_edges", &PitchOpts::snip_edges);

    m.def("ComputeKaldiPitch",
          &paddleaudio::kaldi::ComputeKaldiPitch,
          "compute kaldi pitch");

    // Framing options shared by the Kaldi feature computers.
    py::class_<FrameOpts>(m, "FrameExtractionOptions")
        .def(py::init<>())
        .def_readwrite("samp_freq", &FrameOpts::samp_freq)
        .def_readwrite("frame_shift_ms", &FrameOpts::frame_shift_ms)
        .def_readwrite("frame_length_ms", &FrameOpts::frame_length_ms)
        .def_readwrite("dither", &FrameOpts::dither)
        .def_readwrite("preemph_coeff", &FrameOpts::preemph_coeff)
        .def_readwrite("remove_dc_offset", &FrameOpts::remove_dc_offset)
        .def_readwrite("window_type", &FrameOpts::window_type)
        .def_readwrite("round_to_power_of_two",
                       &FrameOpts::round_to_power_of_two)
        .def_readwrite("blackman_coeff", &FrameOpts::blackman_coeff)
        .def_readwrite("snip_edges", &FrameOpts::snip_edges)
        .def_readwrite("allow_downsample", &FrameOpts::allow_downsample)
        .def_readwrite("allow_upsample", &FrameOpts::allow_upsample)
        .def_readwrite("max_feature_vectors", &FrameOpts::max_feature_vectors);

    // Mel filter bank options.
    py::class_<MelOpts>(m, "MelBanksOptions")
        .def(py::init<>())
        .def_readwrite("num_bins", &MelOpts::num_bins)
        .def_readwrite("low_freq", &MelOpts::low_freq)
        .def_readwrite("high_freq", &MelOpts::high_freq)
        .def_readwrite("vtln_low", &MelOpts::vtln_low)
        .def_readwrite("vtln_high", &MelOpts::vtln_high)
        .def_readwrite("debug_mel", &MelOpts::debug_mel)
        .def_readwrite("htk_mode", &MelOpts::htk_mode);

    // paddleaudio's own fbank switches.
    py::class_<FbankOpts>(m, "FbankOptions")
        .def(py::init<>())
        .def_readwrite("use_energy", &FbankOpts::use_energy)
        .def_readwrite("energy_floor", &FbankOpts::energy_floor)
        .def_readwrite("raw_energy", &FbankOpts::raw_energy)
        .def_readwrite("htk_compat", &FbankOpts::htk_compat)
        .def_readwrite("use_log_fbank", &FbankOpts::use_log_fbank)
        .def_readwrite("use_power", &FbankOpts::use_power);
#endif
}
|
@ -1,257 +0,0 @@
|
||||
#include <mutex>
|
||||
#include <sox.h>
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/sox/effects.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/utils.h"
|
||||
|
||||
using namespace paddleaudio::sox_utils;
|
||||
|
||||
namespace paddleaudio::sox_effects {
|
||||
|
||||
// Streaming decoding over file-like object is tricky because libsox operates on
|
||||
// FILE pointer. The following is what `sox` and `play` commands do
|
||||
// - file input -> FILE pointer
|
||||
// - URL input -> call wget in subprocess and pipe the data -> FILE pointer
|
||||
// - stdin -> FILE pointer
|
||||
//
|
||||
// We want to, instead, fetch byte strings chunk by chunk, consume them, and
|
||||
// discard.
|
||||
//
|
||||
// Here is the approach
|
||||
// 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
|
||||
// chunk of byte string
|
||||
// This will perform header-based format detection, if necessary, then fill
|
||||
// the metadata of sox_format_t. Internally, sox_open_mem_read uses fmemopen,
|
||||
// which returns FILE* which points the buffer of the provided byte string.
|
||||
// 2. Each time sox reads a chunk from the FILE*, we update the underlying
|
||||
// buffer in a way that it
|
||||
// starts with unseen data, and append the new data read from the given
|
||||
// fileobj. This will trick libsox as if it keeps reading from the FILE*
|
||||
// continuously.
|
||||
// For Step 2. see `fileobj_input_drain` function in effects_chain.cpp
|
||||
auto apply_effects_fileobj(
|
||||
py::object fileobj,
|
||||
const std::vector<std::vector<std::string>>& effects,
|
||||
tl::optional<bool> normalize,
|
||||
tl::optional<bool> channels_first,
|
||||
tl::optional<std::string> format)
|
||||
-> tl::optional<std::tuple<py::array, int64_t>> {
|
||||
// Prepare the buffer used throughout the lifecycle of SoxEffectChain.
|
||||
//
|
||||
// For certain format (such as FLAC), libsox keeps reading the content at
|
||||
// the initialization unless it reaches EOF even when the header is properly
|
||||
// parsed. (Making buffer size 8192, which is way bigger than the header,
|
||||
// resulted in libsox consuming all the buffer content at the time it opens
|
||||
// the file.) Therefore buffer has to always contain valid data, except after
|
||||
// EOF. We default to `sox_get_globals()->bufsiz`* for buffer size and we
|
||||
// first check if there is enough data to fill the buffer. `read_fileobj`
|
||||
// repeatedly calls `read` method until it receives the requested length of
|
||||
// bytes or it reaches EOF. If we get bytes shorter than requested, that means
|
||||
// the whole audio data are fetched.
|
||||
//
|
||||
// * This can be changed with `paddleaudio.utils.sox_utils.set_buffer_size`.
|
||||
const auto capacity = [&]() {
|
||||
// NOTE:
|
||||
// Use the abstraction provided by `libpaddleaudio` to access the global
|
||||
// config defined by libsox. Directly using `sox_get_globals` function will
|
||||
// end up retrieving the static variable defined in `_paddleaudio`, which is
|
||||
// not correct.
|
||||
const auto bufsiz = get_buffer_size();
|
||||
const int64_t kDefaultCapacityInBytes = 256;
|
||||
return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
|
||||
: kDefaultCapacityInBytes;
|
||||
}();
|
||||
std::string buffer(capacity, '\0');
|
||||
auto* in_buf = const_cast<char*>(buffer.data());
|
||||
auto num_read = read_fileobj(&fileobj, capacity, in_buf);
|
||||
// If the file is shorter than 256, then libsox cannot read the header.
|
||||
auto in_buffer_size = (num_read > 256) ? num_read : 256;
|
||||
|
||||
// Open file (this starts reading the header)
|
||||
// When opening a file there are two functions that can touches FILE*.
|
||||
// * `auto_detect_format`
|
||||
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L43
|
||||
// * `startread` handler of detected format.
|
||||
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L574
|
||||
// To see the handler of a particular format, go to
|
||||
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/<FORMAT>.c
|
||||
// For example, voribs can be found
|
||||
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/vorbis.c#L97-L158
|
||||
SoxFormat sf(sox_open_mem_read(
|
||||
in_buf,
|
||||
in_buffer_size,
|
||||
/*signal=*/nullptr,
|
||||
/*encoding=*/nullptr,
|
||||
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||
|
||||
// In case of streamed data, length can be 0
|
||||
if (static_cast<sox_format_t*>(sf) == nullptr ||
|
||||
sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||
return {};
|
||||
}
|
||||
|
||||
// Prepare output buffer
|
||||
std::vector<sox_sample_t> out_buffer;
|
||||
out_buffer.reserve(sf->signal.length);
|
||||
|
||||
// Create and run SoxEffectsChain
|
||||
const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
|
||||
paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
|
||||
/*input_encoding=*/sf->encoding,
|
||||
/*output_encoding=*/get_tensor_encodinginfo(dtype));
|
||||
chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
|
||||
for (const auto& effect : effects) {
|
||||
chain.addEffect(effect);
|
||||
}
|
||||
chain.addOutputBuffer(&out_buffer);
|
||||
chain.run();
|
||||
|
||||
// Create tensor from buffer
|
||||
bool channels_first_ = channels_first.value_or(true);
|
||||
auto tensor = convert_to_tensor(
|
||||
/*buffer=*/out_buffer.data(),
|
||||
/*num_samples=*/out_buffer.size(),
|
||||
/*num_channels=*/chain.getOutputNumChannels(),
|
||||
dtype,
|
||||
normalize.value_or(true),
|
||||
channels_first_);
|
||||
|
||||
return std::forward_as_tuple(
|
||||
tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
|
||||
}
|
||||
|
||||
namespace {

// Lifecycle of the libsox effects resources managed by
// `initialize_sox_effects` / `shutdown_sox_effects`.
enum SoxEffectsResourceState {
  NotInitialized,
  Initialized,
  ShutDown,
};

// Current lifecycle state; only accessed while holding the mutex below.
SoxEffectsResourceState SOX_RESOURCE_STATE =
    SoxEffectsResourceState::NotInitialized;

// Guards every read and write of SOX_RESOURCE_STATE.
std::mutex SOX_RESOUCE_STATE_MUTEX;

}  // namespace
|
||||
|
||||
void initialize_sox_effects() {
|
||||
const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
|
||||
|
||||
switch (SOX_RESOURCE_STATE) {
|
||||
case NotInitialized:
|
||||
if (sox_init() != SOX_SUCCESS) {
|
||||
throw std::runtime_error("Failed to initialize sox effects.");
|
||||
};
|
||||
SOX_RESOURCE_STATE = Initialized;
|
||||
break;
|
||||
case Initialized:
|
||||
break;
|
||||
case ShutDown:
|
||||
throw std::runtime_error(
|
||||
"SoX Effects has been shut down. Cannot initialize again.");
|
||||
}
|
||||
};
|
||||
|
||||
void shutdown_sox_effects() {
|
||||
const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);
|
||||
|
||||
switch (SOX_RESOURCE_STATE) {
|
||||
case NotInitialized:
|
||||
throw std::runtime_error(
|
||||
"SoX Effects is not initialized. Cannot shutdown.");
|
||||
case Initialized:
|
||||
if (sox_quit() != SOX_SUCCESS) {
|
||||
throw std::runtime_error("Failed to initialize sox effects.");
|
||||
};
|
||||
SOX_RESOURCE_STATE = ShutDown;
|
||||
break;
|
||||
case ShutDown:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the given SoX effects over an in-memory waveform.
//
// waveform       : input array; checked with `validate_input_tensor`.
// sample_rate    : sample rate of `waveform`.
// effects        : each inner vector is an effect name followed by its options.
// channels_first : layout flag forwarded to the chain input and to
//                  `convert_to_tensor`.
//
// Returns (processed samples, output sample rate). The output keeps the
// input dtype and is not normalized.
auto apply_effects_tensor(
    py::array waveform,
    int64_t sample_rate,
    const std::vector<std::vector<std::string>>& effects,
    bool channels_first) -> std::tuple<py::array, int64_t> {
  validate_input_tensor(waveform);

  // Input and output share the encoding derived from the array dtype.
  const auto dtype = waveform.dtype();
  const auto tensor_encoding_in = get_tensor_encodinginfo(dtype);
  const auto tensor_encoding_out = get_tensor_encodinginfo(dtype);
  paddleaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/tensor_encoding_in,
      /*output_encoding=*/tensor_encoding_out);

  // Destination for the processed samples.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(waveform.size());

  // Assemble the chain: array input -> effects -> output buffer.
  chain.addInputTensor(&waveform, sample_rate, channels_first);
  for (const auto& effect_args : effects) {
    chain.addEffect(effect_args);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Wrap the collected samples into an array.
  auto out_tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      /*normalize=*/false,
      channels_first);

  const int64_t out_rate = chain.getOutputSampleRate();
  return std::tuple<py::array, int64_t>(out_tensor, out_rate);
}
|
||||
|
||||
auto apply_effects_file(
|
||||
const std::string& path,
|
||||
const std::vector<std::vector<std::string>>& effects,
|
||||
tl::optional<bool> normalize,
|
||||
tl::optional<bool> channels_first,
|
||||
const tl::optional<std::string>& format)
|
||||
-> tl::optional<std::tuple<py::array, int64_t>> {
|
||||
// Open input file
|
||||
SoxFormat sf(sox_open_read(
|
||||
path.c_str(),
|
||||
/*signal=*/nullptr,
|
||||
/*encoding=*/nullptr,
|
||||
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||
|
||||
if (static_cast<sox_format_t*>(sf) == nullptr ||
|
||||
sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
|
||||
|
||||
// Prepare output
|
||||
std::vector<sox_sample_t> out_buffer;
|
||||
out_buffer.reserve(sf->signal.length);
|
||||
|
||||
// Create and run SoxEffectsChain
|
||||
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
|
||||
/*input_encoding=*/sf->encoding,
|
||||
/*output_encoding=*/get_tensor_encodinginfo(dtype));
|
||||
|
||||
chain.addInputFile(sf);
|
||||
for (const auto& effect : effects) {
|
||||
chain.addEffect(effect);
|
||||
}
|
||||
chain.addOutputBuffer(&out_buffer);
|
||||
chain.run();
|
||||
|
||||
// Create tensor from buffer
|
||||
bool channels_first_ = channels_first.value_or(true);
|
||||
auto tensor = convert_to_tensor(
|
||||
/*buffer=*/out_buffer.data(),
|
||||
/*num_samples=*/out_buffer.size(),
|
||||
/*num_channels=*/chain.getOutputNumChannels(),
|
||||
dtype,
|
||||
normalize.value_or(true),
|
||||
channels_first_);
|
||||
|
||||
return std::tuple<py::array, int64_t>(
|
||||
tensor, chain.getOutputSampleRate());
|
||||
}
|
||||
|
||||
} // namespace paddleaudio::sox_effects
|
@ -1,36 +0,0 @@
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/numpy.h>
|
||||
|
||||
#include "paddlespeech/audio/src/optional/optional.hpp"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace paddleaudio::sox_effects {
|
||||
|
||||
auto apply_effects_fileobj(
|
||||
py::object fileobj,
|
||||
const std::vector<std::vector<std::string>>& effects,
|
||||
tl::optional<bool> normalize,
|
||||
tl::optional<bool> channels_first,
|
||||
tl::optional<std::string> format)
|
||||
-> tl::optional<std::tuple<py::array, int64_t>>;
|
||||
|
||||
void initialize_sox_effects();
|
||||
|
||||
void shutdown_sox_effects();
|
||||
|
||||
auto apply_effects_tensor(
|
||||
py::array waveform,
|
||||
int64_t sample_rate,
|
||||
const std::vector<std::vector<std::string>>& effects,
|
||||
bool channels_first) -> std::tuple<py::array, int64_t>;
|
||||
|
||||
auto apply_effects_file(
|
||||
const std::string& path,
|
||||
const std::vector<std::vector<std::string>>& effects,
|
||||
tl::optional<bool> normalize,
|
||||
tl::optional<bool> channels_first,
|
||||
const tl::optional<std::string>& format)
|
||||
-> tl::optional<std::tuple<py::array, int64_t>>;
|
||||
|
||||
} // namespace paddleaudio::sox_effects
|
@ -1,595 +0,0 @@
|
||||
#include <sox.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/utils.h"
|
||||
|
||||
using namespace paddleaudio::sox_utils;
|
||||
|
||||
namespace paddleaudio::sox_effects_chain {
|
||||
|
||||
namespace {
|
||||
|
||||
/// helper classes for passing the location of input tensor and output buffer
|
||||
///
|
||||
/// drain/flow callback functions require plain C style function signature and
|
||||
/// the way to pass extra data is to attach data to sox_effect_t::priv pointer.
|
||||
/// The following structs will be assigned to sox_effect_t::priv pointer which
|
||||
/// gives sox_effect_t an access to input Tensor and output buffer object.
|
||||
struct TensorInputPriv {
|
||||
size_t index;
|
||||
py::array* waveform;
|
||||
int64_t sample_rate;
|
||||
bool channels_first;
|
||||
};
|
||||
|
||||
struct TensorOutputPriv {
|
||||
std::vector<sox_sample_t>* buffer;
|
||||
};
|
||||
struct FileOutputPriv {
|
||||
sox_format_t* sf;
|
||||
};
|
||||
|
||||
/// Callback function to feed Tensor data to SoxEffectChain.
|
||||
int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) {
|
||||
// Retrieve the input Tensor and current index
|
||||
auto priv = static_cast<TensorInputPriv*>(effp->priv);
|
||||
auto index = priv->index;
|
||||
auto tensor = *(priv->waveform);
|
||||
auto num_channels = effp->out_signal.channels;
|
||||
|
||||
// Adjust the number of samples to read
|
||||
const size_t num_samples = tensor.size();
|
||||
if (index + *osamp > num_samples) {
|
||||
*osamp = num_samples - index;
|
||||
}
|
||||
|
||||
// Ensure that it's a multiple of the number of channels
|
||||
*osamp -= *osamp % num_channels;
|
||||
|
||||
// Slice the input Tensor
|
||||
// refacor this module, chunk
|
||||
auto i_frame = index / num_channels;
|
||||
auto num_frames = *osamp / num_channels;
|
||||
|
||||
std::vector<int> chunk(num_frames*num_channels);
|
||||
py::buffer_info ori_info = tensor.request();
|
||||
void* ptr = ori_info.ptr;
|
||||
// Convert to sox_sample_t (int32_t)
|
||||
switch (tensor.dtype().num()) {
|
||||
//case c10::ScalarType::Float: {
|
||||
case 11: {
|
||||
// Need to convert to 64-bit precision so that
|
||||
// values around INT32_MIN/MAX are handled correctly.
|
||||
for (int idx = 0; idx < chunk.size(); ++idx) {
|
||||
int frame_idx = (idx + index) / num_channels;
|
||||
int channels_idx = (idx + index) % num_channels;
|
||||
double elem = 0;
|
||||
if (priv->channels_first) {
|
||||
elem = *(float*)tensor.data(channels_idx, frame_idx);
|
||||
} else {
|
||||
elem = *(float*)tensor.data(frame_idx, channels_idx);
|
||||
}
|
||||
elem = elem * 2147483648.;
|
||||
// *new_ptr = std::clamp(elem, INT32_MIN, INT32_MAX);
|
||||
if (elem > INT32_MAX) {
|
||||
chunk[idx] = INT32_MAX;
|
||||
} else if (elem < INT32_MIN) {
|
||||
chunk[idx] = INT32_MIN;
|
||||
} else {
|
||||
chunk[idx] = elem;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
//case c10::ScalarType::Int: {
|
||||
case 5: {
|
||||
for (int idx = 0; idx < chunk.size(); ++idx) {
|
||||
int frame_idx = (idx + index) / num_channels;
|
||||
int channels_idx = (idx + index) % num_channels;
|
||||
int elem = 0;
|
||||
if (priv->channels_first) {
|
||||
elem = *(int*)tensor.data(channels_idx, frame_idx);
|
||||
} else {
|
||||
elem = *(int*)tensor.data(frame_idx, channels_idx);
|
||||
}
|
||||
chunk[idx] = elem;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// case short
|
||||
case 3: {
|
||||
for (int idx = 0; idx < chunk.size(); ++idx) {
|
||||
int frame_idx = (idx + index) / num_channels;
|
||||
int channels_idx = (idx + index) % num_channels;
|
||||
int16_t elem = 0;
|
||||
if (priv->channels_first) {
|
||||
elem = *(int16_t*)tensor.data(channels_idx, frame_idx);
|
||||
} else {
|
||||
elem = *(int16_t*)tensor.data(frame_idx, channels_idx);
|
||||
}
|
||||
chunk[idx] = elem * 65536;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// case byte
|
||||
case 1: {
|
||||
for (int idx = 0; idx < chunk.size(); ++idx) {
|
||||
int frame_idx = (idx + index) / num_channels;
|
||||
int channels_idx = (idx + index) % num_channels;
|
||||
int8_t elem = 0;
|
||||
if (priv->channels_first) {
|
||||
elem = *(int8_t*)tensor.data(channels_idx, frame_idx);
|
||||
} else {
|
||||
elem = *(int8_t*)tensor.data(frame_idx, channels_idx);
|
||||
}
|
||||
chunk[idx] = (elem - 128) * 16777216;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("Unexpected dtype.");
|
||||
}
|
||||
// Write to buffer
|
||||
memcpy(obuf, chunk.data(), *osamp * 4);
|
||||
priv->index += *osamp;
|
||||
return (priv->index == num_samples) ? SOX_EOF : SOX_SUCCESS;
|
||||
}
|
||||
|
||||
/// Callback function to fetch data from SoxEffectChain.
|
||||
int tensor_output_flow(
|
||||
sox_effect_t* effp,
|
||||
sox_sample_t const* ibuf,
|
||||
sox_sample_t* obuf LSX_UNUSED,
|
||||
size_t* isamp,
|
||||
size_t* osamp) {
|
||||
*osamp = 0;
|
||||
// Get output buffer
|
||||
auto out_buffer = static_cast<TensorOutputPriv*>(effp->priv)->buffer;
|
||||
// Append at the end
|
||||
out_buffer->insert(out_buffer->end(), ibuf, ibuf + *isamp);
|
||||
return SOX_SUCCESS;
|
||||
}
|
||||
|
||||
/// Flow callback that writes incoming samples to the output file stored in
/// the effect's private data.
///
/// Nothing is forwarded downstream, so `*osamp` is always set to 0.
/// Returns SOX_SUCCESS when there is nothing to write or every sample was
/// written, SOX_EOF on a short write without an error code.
/// Throws std::runtime_error on a short write when libsox reports an error.
int file_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) {
  *osamp = 0;

  if (!*isamp) {
    return SOX_SUCCESS;
  }

  sox_format_t* sf = static_cast<FileOutputPriv*>(effp->priv)->sf;
  if (sox_write(sf, ibuf, *isamp) == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: surface the libsox error if there is one.
  if (sf->sox_errno) {
    std::ostringstream message;
    message << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
            << sf->filename;
    throw std::runtime_error(message.str());
  }
  return SOX_EOF;
}
|
||||
|
||||
/// Returns the handler of the "input_tensor" effect. The handler is a
/// function-local static, so every call returns the same pointer. Only the
/// drain callback is set; all other callbacks are null.
sox_effect_handler_t* get_tensor_input_handler() {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};  // every field not set below stays null/zero
    h.name = "input_tensor";
    h.usage = nullptr;
    h.flags = SOX_EFF_MCHAN;
    h.drain = tensor_input_drain;
    h.priv_size = sizeof(TensorInputPriv);
    return h;
  }();
  return &handler;
}
|
||||
|
||||
/// Returns the handler of the "output_tensor" effect. The handler is a
/// function-local static, so every call returns the same pointer. Only the
/// flow callback is set; all other callbacks are null.
sox_effect_handler_t* get_tensor_output_handler() {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};  // every field not set below stays null/zero
    h.name = "output_tensor";
    h.usage = nullptr;
    h.flags = SOX_EFF_MCHAN;
    h.flow = tensor_output_flow;
    h.priv_size = sizeof(TensorOutputPriv);
    return h;
  }();
  return &handler;
}
|
||||
|
||||
/// Returns the handler of the "output_file" effect. The handler is a
/// function-local static, so every call returns the same pointer. Only the
/// flow callback is set; all other callbacks are null.
sox_effect_handler_t* get_file_output_handler() {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};  // every field not set below stays null/zero
    h.name = "output_file";
    h.usage = nullptr;
    h.flags = SOX_EFF_MCHAN;
    h.flow = file_output_flow;
    h.priv_size = sizeof(FileOutputPriv);
    return h;
  }();
  return &handler;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
SoxEffect::SoxEffect(sox_effect_t* se) noexcept : se_(se) {}
|
||||
|
||||
SoxEffect::~SoxEffect() {
|
||||
if (se_ != nullptr) {
|
||||
free(se_);
|
||||
}
|
||||
}
|
||||
|
||||
SoxEffect::operator sox_effect_t*() const {
|
||||
return se_;
|
||||
}
|
||||
|
||||
auto SoxEffect::operator->() noexcept -> sox_effect_t* {
|
||||
return se_;
|
||||
}
|
||||
|
||||
SoxEffectsChain::SoxEffectsChain(
|
||||
sox_encodinginfo_t input_encoding,
|
||||
sox_encodinginfo_t output_encoding)
|
||||
: in_enc_(input_encoding),
|
||||
out_enc_(output_encoding),
|
||||
in_sig_(),
|
||||
interm_sig_(),
|
||||
out_sig_(),
|
||||
sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) {
|
||||
if (!sec_) {
|
||||
throw std::runtime_error("Failed to create effect chain.");
|
||||
}
|
||||
}
|
||||
|
||||
// Deletes the underlying libsox chain, if one was created.
SoxEffectsChain::~SoxEffectsChain() {
  if (!sec_) {
    return;
  }
  sox_delete_effects_chain(sec_);
}
|
||||
|
||||
// Runs the whole chain with no flow callback and no client data.
// The status returned by libsox is not inspected.
void SoxEffectsChain::run() {
  sox_flow_effects(sec_, /*callback=*/nullptr, /*client_data=*/nullptr);
}
|
||||
|
||||
// Adds an input effect that reads samples from `waveform`.
//
// The pointer is stored in the effect's private data and is not owned.
// Throws std::runtime_error when libsox refuses the effect.
void SoxEffectsChain::addInputTensor(
    py::array* waveform,
    int64_t sample_rate,
    bool channels_first) {
  // The input defines both the input and the initial intermediate signal.
  in_sig_ = get_signalinfo(waveform, sample_rate, "wav", channels_first);
  interm_sig_ = in_sig_;

  SoxEffect effect(sox_create_effect(get_tensor_input_handler()));

  // Fill in the state read by the drain callback.
  auto* state = static_cast<TensorInputPriv*>(effect->priv);
  state->index = 0;
  state->waveform = waveform;
  state->sample_rate = sample_rate;
  state->channels_first = channels_first;

  const int status = sox_add_effect(sec_, effect, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input_tensor");
  }
}
|
||||
|
||||
void SoxEffectsChain::addOutputBuffer(
|
||||
std::vector<sox_sample_t>* output_buffer) {
|
||||
SoxEffect e(sox_create_effect(get_tensor_output_handler()));
|
||||
static_cast<TensorOutputPriv*>(e->priv)->buffer = output_buffer;
|
||||
if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
|
||||
throw std::runtime_error(
|
||||
"Internal Error: Failed to add effect: output_tensor");
|
||||
}
|
||||
}
|
||||
|
||||
// Adds libsox's built-in "input" effect reading from the opened file `sf`.
//
// Throws std::runtime_error when libsox refuses the effect.
void SoxEffectsChain::addInputFile(sox_format_t* sf) {
  // The file defines both the input and the initial intermediate signal.
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect effect(sox_create_effect(sox_find_effect("input")));

  // The "input" effect receives the format handle disguised as its single
  // option string.
  char* opts[1];
  opts[0] = reinterpret_cast<char*>(sf);
  sox_effect_options(effect, 1, opts);

  const int status = sox_add_effect(sec_, effect, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream message;
    message << "Internal Error: Failed to add effect: input " << sf->filename;
    throw std::runtime_error(message.str());
  }
}
|
||||
|
||||
// Adds an output effect that writes processed samples to the opened file
// `sf`.
//
// The pointer is stored in the effect's private data and is not owned.
// Throws std::runtime_error when libsox refuses the effect.
void SoxEffectsChain::addOutputFile(sox_format_t* sf) {
  out_sig_ = sf->signal;

  SoxEffect effect(sox_create_effect(get_file_output_handler()));

  auto* state = static_cast<FileOutputPriv*>(effect->priv);
  state->sf = sf;

  const int status = sox_add_effect(sec_, effect, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    std::ostringstream message;
    message << "Internal Error: Failed to add effect: output " << sf->filename;
    throw std::runtime_error(message.str());
  }
}
|
||||
|
||||
void SoxEffectsChain::addEffect(const std::vector<std::string> effect) {
|
||||
const auto num_args = effect.size();
|
||||
if (num_args == 0) {
|
||||
throw std::runtime_error("Invalid argument: empty effect.");
|
||||
}
|
||||
const auto name = effect[0];
|
||||
if (UNSUPPORTED_EFFECTS.find(name) != UNSUPPORTED_EFFECTS.end()) {
|
||||
std::ostringstream stream;
|
||||
stream << "Unsupported effect: " << name;
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
|
||||
auto returned_effect = sox_find_effect(name.c_str());
|
||||
if (!returned_effect) {
|
||||
std::ostringstream stream;
|
||||
stream << "Unsupported effect: " << name;
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
SoxEffect e(sox_create_effect(returned_effect));
|
||||
const auto num_options = num_args - 1;
|
||||
|
||||
std::vector<char*> opts;
|
||||
for (size_t i = 1; i < num_args; ++i) {
|
||||
opts.push_back((char*)effect[i].c_str());
|
||||
}
|
||||
if (sox_effect_options(e, num_options, num_options ? opts.data() : nullptr) !=
|
||||
SOX_SUCCESS) {
|
||||
std::ostringstream stream;
|
||||
stream << "Invalid effect option:";
|
||||
for (const auto& v : effect) {
|
||||
stream << " " << v;
|
||||
}
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
|
||||
if (sox_add_effect(sec_, e, &interm_sig_, &in_sig_) != SOX_SUCCESS) {
|
||||
std::ostringstream stream;
|
||||
stream << "Internal Error: Failed to add effect: \"" << name;
|
||||
for (size_t i = 1; i < num_args; ++i) {
|
||||
stream << " " << effect[i];
|
||||
}
|
||||
stream << "\"";
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
}
|
||||
|
||||
int64_t SoxEffectsChain::getOutputNumChannels() {
|
||||
return interm_sig_.channels;
|
||||
}
|
||||
|
||||
int64_t SoxEffectsChain::getOutputSampleRate() {
|
||||
return interm_sig_.rate;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/// helper classes for passing file-like object to SoxEffectChain
|
||||
struct FileObjInputPriv {
|
||||
sox_format_t* sf;
|
||||
py::object* fileobj;
|
||||
bool eof_reached;
|
||||
char* buffer;
|
||||
uint64_t buffer_size;
|
||||
};
|
||||
|
||||
struct FileObjOutputPriv {
|
||||
sox_format_t* sf;
|
||||
py::object* fileobj;
|
||||
char** buffer;
|
||||
size_t* buffer_size;
|
||||
};
|
||||
|
||||
/// Callback function to feed byte string
|
||||
/// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/sox.h#L1268-L1278
|
||||
auto fileobj_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp)
|
||||
-> int {
|
||||
auto priv = static_cast<FileObjInputPriv*>(effp->priv);
|
||||
auto sf = priv->sf;
|
||||
auto buffer = priv->buffer;
|
||||
|
||||
// 1. Refresh the buffer
|
||||
//
|
||||
// NOTE:
|
||||
// Since the underlying FILE* was opened with `fmemopen`, the only way
|
||||
// libsox detect EOF is reaching the end of the buffer. (null byte won't
|
||||
// help) Therefore we need to align the content at the end of buffer,
|
||||
// otherwise, libsox will keep reading the content beyond intended length.
|
||||
//
|
||||
// Before:
|
||||
//
|
||||
// |<-------consumed------>|<---remaining--->|
|
||||
// |***********************|-----------------|
|
||||
// ^ ftell
|
||||
//
|
||||
// After:
|
||||
//
|
||||
// |<-offset->|<---remaining--->|<-new data->|
|
||||
// |**********|-----------------|++++++++++++|
|
||||
// ^ ftell
|
||||
|
||||
// NOTE:
|
||||
// Do not use `sf->tell_off` here. Presumably, `tell_off` and `fseek` are
|
||||
// supposed to be in sync, but there are cases (Vorbis) they are not
|
||||
// in sync and `tell_off` has seemingly uninitialized value, which
|
||||
// leads num_remain to be negative and cause segmentation fault
|
||||
// in `memmove`.
|
||||
const auto tell = ftell((FILE*)sf->fp);
|
||||
if (tell < 0) {
|
||||
throw std::runtime_error("Internal Error: ftell failed.");
|
||||
}
|
||||
const auto num_consumed = static_cast<size_t>(tell);
|
||||
if (num_consumed > priv->buffer_size) {
|
||||
throw std::runtime_error("Internal Error: buffer overrun.");
|
||||
}
|
||||
|
||||
const auto num_remain = priv->buffer_size - num_consumed;
|
||||
|
||||
// 1.1. Fetch the data to see if there is data to fill the buffer
|
||||
size_t num_refill = 0;
|
||||
std::string chunk(num_consumed, '\0');
|
||||
if (num_consumed && !priv->eof_reached) {
|
||||
num_refill = read_fileobj(
|
||||
priv->fileobj, num_consumed, const_cast<char*>(chunk.data()));
|
||||
if (num_refill < num_consumed) {
|
||||
priv->eof_reached = true;
|
||||
}
|
||||
}
|
||||
const auto offset = num_consumed - num_refill;
|
||||
|
||||
// 1.2. Move the unconsumed data towards the beginning of buffer.
|
||||
if (num_remain) {
|
||||
auto src = static_cast<void*>(buffer + num_consumed);
|
||||
auto dst = static_cast<void*>(buffer + offset);
|
||||
memmove(dst, src, num_remain);
|
||||
}
|
||||
|
||||
// 1.3. Refill the remaining buffer.
|
||||
if (num_refill) {
|
||||
auto src = static_cast<void*>(const_cast<char*>(chunk.c_str()));
|
||||
auto dst = buffer + offset + num_remain;
|
||||
memcpy(dst, src, num_refill);
|
||||
}
|
||||
|
||||
// 1.4. Set the file pointer to the new offset
|
||||
sf->tell_off = offset;
|
||||
fseek((FILE*)sf->fp, offset, SEEK_SET);
|
||||
|
||||
// 2. Perform decoding operation
|
||||
// The following part is practically same as "input" effect
|
||||
// https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/input.c#L30-L48
|
||||
|
||||
// At this point, osamp represents the buffer size in bytes,
|
||||
// but sox_read expects the maximum number of samples ready to read.
|
||||
// Normally, this is fine, but in case when the samples are not 4-byte
|
||||
// aligned, (e.g. sample is 24bits), the resulting signal is not correct.
|
||||
// https://github.com/pytorch/audio/issues/2083
|
||||
if (sf->encoding.bits_per_sample > 0)
|
||||
*osamp /= (sf->encoding.bits_per_sample / 8);
|
||||
|
||||
// Ensure that it's a multiple of the number of channels
|
||||
*osamp -= *osamp % effp->out_signal.channels;
|
||||
|
||||
// Read up to *osamp samples into obuf;
|
||||
// store the actual number read back to *osamp
|
||||
*osamp = sox_read(sf, obuf, *osamp);
|
||||
|
||||
// Decoding is finished when fileobject is exhausted and sox can no longer
|
||||
// decode a sample.
|
||||
return (priv->eof_reached && !*osamp) ? SOX_EOF : SOX_SUCCESS;
|
||||
}
|
||||
|
||||
/// Flow callback that encodes incoming samples and forwards the encoded
/// bytes to the Python file object stored in the effect's private data.
///
/// Nothing is forwarded downstream, so `*osamp` is always set to 0.
/// Returns SOX_SUCCESS when there is nothing to write or every sample was
/// written, SOX_EOF on a short write without an error code.
/// Throws std::runtime_error on a short write when libsox reports an error.
auto fileobj_output_flow(
    sox_effect_t* effp,
    sox_sample_t const* ibuf,
    sox_sample_t* obuf LSX_UNUSED,
    size_t* isamp,
    size_t* osamp) -> int {
  *osamp = 0;
  if (!*isamp) {
    return SOX_SUCCESS;
  }

  auto* priv = static_cast<FileObjOutputPriv*>(effp->priv);
  sox_format_t* sf = priv->sf;
  FILE* fp = static_cast<FILE*>(sf->fp);
  py::object* fileobj = priv->fileobj;
  char** buffer = priv->buffer;

  // Encode chunk
  const auto num_samples_written = sox_write(sf, ibuf, *isamp);
  fflush(fp);

  // Copy the encoded chunk to python object. The current stream position
  // gives the number of encoded bytes.
  fileobj->attr("write")(py::bytes(*buffer, ftell(fp)));

  // Reset FILE* so that the next chunk starts at the beginning again.
  sf->tell_off = 0;
  fseek(fp, 0, SEEK_SET);

  if (num_samples_written == *isamp) {
    return SOX_SUCCESS;
  }

  // Short write: surface the libsox error if there is one.
  if (sf->sox_errno) {
    std::ostringstream message;
    message << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " "
            << sf->filename;
    throw std::runtime_error(message.str());
  }
  return SOX_EOF;
}
|
||||
|
||||
/// Returns the handler of the "input_fileobj_object" effect. The handler is
/// a function-local static, so every call returns the same pointer. Only the
/// drain callback is set; all other callbacks are null.
auto get_fileobj_input_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};  // every field not set below stays null/zero
    h.name = "input_fileobj_object";
    h.usage = nullptr;
    h.flags = SOX_EFF_MCHAN;
    h.drain = fileobj_input_drain;
    h.priv_size = sizeof(FileObjInputPriv);
    return h;
  }();
  return &handler;
}
|
||||
|
||||
/// Returns the handler of the "output_fileobj_object" effect. The handler is
/// a function-local static, so every call returns the same pointer. Only the
/// flow callback is set; all other callbacks are null.
auto get_fileobj_output_handler() -> sox_effect_handler_t* {
  static sox_effect_handler_t handler = [] {
    sox_effect_handler_t h{};  // every field not set below stays null/zero
    h.name = "output_fileobj_object";
    h.usage = nullptr;
    h.flags = SOX_EFF_MCHAN;
    h.flow = fileobj_output_flow;
    h.priv_size = sizeof(FileObjOutputPriv);
    return h;
  }();
  return &handler;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Adds an input effect that decodes audio streamed from a Python file-like
// object.
//
// sf          : format handle opened on top of `buffer`.
// buffer      : memory region backing `sf`.
// buffer_size : size of `buffer` in bytes.
// fileobj     : Python object further bytes are read from.
//
// All pointers are stored in the effect's private data and are not owned.
// Throws std::runtime_error when libsox refuses the effect.
void SoxEffectsChainPyBind::addInputFileObj(
    sox_format_t* sf,
    char* buffer,
    uint64_t buffer_size,
    py::object* fileobj) {
  // The file defines both the input and the initial intermediate signal.
  in_sig_ = sf->signal;
  interm_sig_ = in_sig_;

  SoxEffect effect(sox_create_effect(get_fileobj_input_handler()));

  // Fill in the state read by the drain callback.
  auto* state = static_cast<FileObjInputPriv*>(effect->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->eof_reached = false;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const int status = sox_add_effect(sec_, effect, &interm_sig_, &in_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: input fileobj");
  }
}
|
||||
|
||||
// Adds an output effect that encodes audio and writes it to a Python
// file-like object.
//
// sf          : format handle used for encoding.
// buffer      : address of the pointer to the memory holding encoded bytes.
// buffer_size : address of the size associated with `*buffer`.
// fileobj     : Python object the encoded bytes are written to.
//
// All pointers are stored in the effect's private data and are not owned.
// Throws std::runtime_error when libsox refuses the effect.
void SoxEffectsChainPyBind::addOutputFileObj(
    sox_format_t* sf,
    char** buffer,
    size_t* buffer_size,
    py::object* fileobj) {
  out_sig_ = sf->signal;

  SoxEffect effect(sox_create_effect(get_fileobj_output_handler()));

  // Fill in the state read by the flow callback.
  auto* state = static_cast<FileObjOutputPriv*>(effect->priv);
  state->sf = sf;
  state->fileobj = fileobj;
  state->buffer = buffer;
  state->buffer_size = buffer_size;

  const int status = sox_add_effect(sec_, effect, &interm_sig_, &out_sig_);
  if (status != SOX_SUCCESS) {
    throw std::runtime_error(
        "Internal Error: Failed to add effect: output fileobj");
  }
}
|
||||
|
||||
} // namespace paddleaudio::sox_effects_chain
|
@ -1,76 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <sox.h>
|
||||
#include "paddlespeech/audio/src/pybind/sox/utils.h"
|
||||
|
||||
namespace paddleaudio::sox_effects_chain {
|
||||
|
||||
// Helper struct to safely close sox_effect_t* pointer returned by
|
||||
// sox_create_effect
|
||||
|
||||
struct SoxEffect {
|
||||
explicit SoxEffect(sox_effect_t* se) noexcept;
|
||||
SoxEffect(const SoxEffect& other) = delete;
|
||||
SoxEffect(const SoxEffect&& other) = delete;
|
||||
auto operator=(const SoxEffect& other) -> SoxEffect& = delete;
|
||||
auto operator=(SoxEffect&& other) -> SoxEffect& = delete;
|
||||
~SoxEffect();
|
||||
operator sox_effect_t*() const;
|
||||
auto operator->() noexcept -> sox_effect_t*;
|
||||
|
||||
private:
|
||||
sox_effect_t* se_;
|
||||
};
|
||||
|
||||
// Helper struct to safely close sox_effects_chain_t with handy methods
|
||||
class SoxEffectsChain {
|
||||
const sox_encodinginfo_t in_enc_;
|
||||
const sox_encodinginfo_t out_enc_;
|
||||
|
||||
protected:
|
||||
sox_signalinfo_t in_sig_;
|
||||
sox_signalinfo_t interm_sig_;
|
||||
sox_signalinfo_t out_sig_;
|
||||
sox_effects_chain_t* sec_;
|
||||
|
||||
public:
|
||||
explicit SoxEffectsChain(
|
||||
sox_encodinginfo_t input_encoding,
|
||||
sox_encodinginfo_t output_encoding);
|
||||
SoxEffectsChain(const SoxEffectsChain& other) = delete;
|
||||
SoxEffectsChain(const SoxEffectsChain&& other) = delete;
|
||||
SoxEffectsChain& operator=(const SoxEffectsChain& other) = delete;
|
||||
SoxEffectsChain& operator=(SoxEffectsChain&& other) = delete;
|
||||
~SoxEffectsChain();
|
||||
void run();
|
||||
void addInputTensor(
|
||||
py::array* waveform,
|
||||
int64_t sample_rate,
|
||||
bool channels_first);
|
||||
void addInputFile(sox_format_t* sf);
|
||||
void addOutputBuffer(std::vector<sox_sample_t>* output_buffer);
|
||||
void addOutputFile(sox_format_t* sf);
|
||||
void addEffect(const std::vector<std::string> effect);
|
||||
int64_t getOutputNumChannels();
|
||||
int64_t getOutputSampleRate();
|
||||
};
|
||||
|
||||
class SoxEffectsChainPyBind : public SoxEffectsChain {
|
||||
using SoxEffectsChain::SoxEffectsChain;
|
||||
|
||||
public:
|
||||
void addInputFileObj(
|
||||
sox_format_t* sf,
|
||||
char* buffer,
|
||||
uint64_t buffer_size,
|
||||
py::object* fileobj);
|
||||
|
||||
void addOutputFileObj(
|
||||
sox_format_t* sf,
|
||||
char** buffer,
|
||||
size_t* buffer_size,
|
||||
py::object* fileobj);
|
||||
};
|
||||
|
||||
} // namespace paddleaudio::sox_effects_chain
|
||||
|
@ -1,280 +0,0 @@
|
||||
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
|
||||
// All rights reserved.
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/sox/io.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/effects.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/types.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/effects_chain.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/utils.h"
|
||||
#include "paddlespeech/audio/src/optional/optional.hpp"
|
||||
|
||||
using namespace paddleaudio::sox_utils;
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace sox_io {
|
||||
|
||||
auto get_info_file(const std::string &path,
|
||||
const tl::optional<std::string> &format)
|
||||
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
|
||||
SoxFormat sf(
|
||||
sox_open_read(path.data(),
|
||||
/*signal=*/nullptr,
|
||||
/*encoding=*/nullptr,
|
||||
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||
|
||||
|
||||
validate_input_file(sf, path);
|
||||
|
||||
return std::make_tuple(
|
||||
static_cast<int64_t>(sf->signal.rate),
|
||||
static_cast<int64_t>(sf->signal.length / sf->signal.channels),
|
||||
static_cast<int64_t>(sf->signal.channels),
|
||||
static_cast<int64_t>(sf->encoding.bits_per_sample),
|
||||
get_encoding(sf->encoding.encoding));
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::string>> get_effects(
|
||||
const tl::optional<int64_t>& frame_offset,
|
||||
const tl::optional<int64_t>& num_frames) {
|
||||
const auto offset = frame_offset.value_or(0);
|
||||
if (offset < 0) {
|
||||
throw std::runtime_error(
|
||||
"Invalid argument: frame_offset must be non-negative.");
|
||||
}
|
||||
const auto frames = num_frames.value_or(-1);
|
||||
if (frames == 0 || frames < -1) {
|
||||
throw std::runtime_error(
|
||||
"Invalid argument: num_frames must be -1 or greater than 0.");
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::string>> effects;
|
||||
if (frames != -1) {
|
||||
std::ostringstream os_offset, os_frames;
|
||||
os_offset << offset << "s";
|
||||
os_frames << "+" << frames << "s";
|
||||
effects.emplace_back(
|
||||
std::vector<std::string>{"trim", os_offset.str(), os_frames.str()});
|
||||
} else if (offset != 0) {
|
||||
std::ostringstream os_offset;
|
||||
os_offset << offset << "s";
|
||||
effects.emplace_back(std::vector<std::string>{"trim", os_offset.str()});
|
||||
}
|
||||
return effects;
|
||||
}
|
||||
|
||||
auto get_info_fileobj(py::object fileobj,
|
||||
const tl::optional<std::string> &format)
|
||||
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string> {
|
||||
const auto capacity = [&]() {
|
||||
const auto bufsiz = get_buffer_size();
|
||||
const int64_t kDefaultCapacityInBytes = 4096;
|
||||
return (bufsiz > kDefaultCapacityInBytes) ? bufsiz
|
||||
: kDefaultCapacityInBytes;
|
||||
}();
|
||||
std::string buffer(capacity, '\0');
|
||||
auto *buf = const_cast<char *>(buffer.data());
|
||||
auto num_read = read_fileobj(&fileobj, capacity, buf);
|
||||
// If the file is shorter than 256, then libsox cannot read the header.
|
||||
auto buf_size = (num_read > 256) ? num_read : 256;
|
||||
|
||||
SoxFormat sf(sox_open_mem_read(
|
||||
buf,
|
||||
buf_size,
|
||||
/*signal=*/nullptr,
|
||||
/*encoding=*/nullptr,
|
||||
/*filetype=*/format.has_value() ? format.value().c_str() : nullptr));
|
||||
|
||||
// In case of streamed data, length can be 0
|
||||
validate_input_memfile(sf);
|
||||
|
||||
return std::make_tuple(
|
||||
static_cast<int64_t>(sf->signal.rate),
|
||||
static_cast<int64_t>(sf->signal.length / sf->signal.channels),
|
||||
static_cast<int64_t>(sf->signal.channels),
|
||||
static_cast<int64_t>(sf->encoding.bits_per_sample),
|
||||
get_encoding(sf->encoding.encoding));
|
||||
}
|
||||
|
||||
// Load audio from a Python file-like object.
//
// Translates `frame_offset` / `num_frames` into a sox effect list with
// get_effects (which throws std::runtime_error on invalid values) and
// forwards everything to sox_effects::apply_effects_fileobj, whose result
// is returned unchanged.
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
    py::object fileobj,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  auto trim_effects = get_effects(frame_offset, num_frames);
  // `format` is a const reference, so it is passed as is.
  return paddleaudio::sox_effects::apply_effects_fileobj(
      std::move(fileobj), trim_effects, normalize, channels_first, format);
}
|
||||
|
||||
// Load audio from the file at `path`.
//
// Translates `frame_offset` / `num_frames` into a sox effect list with
// get_effects (which throws std::runtime_error on invalid values) and
// forwards everything to sox_effects::apply_effects_file, whose result is
// returned unchanged.
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
    const std::string& path,
    const tl::optional<int64_t>& frame_offset,
    const tl::optional<int64_t>& num_frames,
    tl::optional<bool> normalize,
    tl::optional<bool> channels_first,
    const tl::optional<std::string>& format) {
  auto trim_effects = get_effects(frame_offset, num_frames);
  return paddleaudio::sox_effects::apply_effects_file(
      path, trim_effects, normalize, channels_first, format);
}
|
||||
|
||||
void save_audio_file(const std::string& path,
|
||||
py::array tensor,
|
||||
int64_t sample_rate,
|
||||
bool channels_first,
|
||||
tl::optional<double> compression,
|
||||
tl::optional<std::string> format,
|
||||
tl::optional<std::string> encoding,
|
||||
tl::optional<int64_t> bits_per_sample) {
|
||||
validate_input_tensor(tensor);
|
||||
|
||||
const auto filetype = [&]() {
|
||||
if (format.has_value()) return format.value();
|
||||
return get_filetype(path);
|
||||
}();
|
||||
|
||||
if (filetype == "amr-nb") {
|
||||
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
|
||||
//TORCH_CHECK(num_channels == 1,
|
||||
// "amr-nb format only supports single channel audio.");
|
||||
assert(num_channels == 1);
|
||||
} else if (filetype == "htk") {
|
||||
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
|
||||
// TORCH_CHECK(num_channels == 1,
|
||||
// "htk format only supports single channel audio.");
|
||||
assert(num_channels == 1);
|
||||
} else if (filetype == "gsm") {
|
||||
const auto num_channels = tensor.shape(channels_first ? 0 : 1);
|
||||
assert(num_channels == 1);
|
||||
assert(sample_rate == 8000);
|
||||
//TORCH_CHECK(num_channels == 1,
|
||||
// "gsm format only supports single channel audio.");
|
||||
//TORCH_CHECK(sample_rate == 8000,
|
||||
// "gsm format only supports a sampling rate of 8kHz.");
|
||||
}
|
||||
const auto signal_info =
|
||||
get_signalinfo(&tensor, sample_rate, filetype, channels_first);
|
||||
const auto encoding_info = get_encodinginfo_for_save(
|
||||
filetype, tensor.dtype(), compression, encoding, bits_per_sample);
|
||||
|
||||
SoxFormat sf(sox_open_write(path.c_str(),
|
||||
&signal_info,
|
||||
&encoding_info,
|
||||
/*filetype=*/filetype.c_str(),
|
||||
/*oob=*/nullptr,
|
||||
/*overwrite_permitted=*/nullptr));
|
||||
|
||||
if (static_cast<sox_format_t*>(sf) == nullptr) {
|
||||
throw std::runtime_error(
|
||||
"Error saving audio file: failed to open file " + path);
|
||||
}
|
||||
|
||||
paddleaudio::sox_effects_chain::SoxEffectsChain chain(
|
||||
/*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
|
||||
/*output_encoding=*/sf->encoding);
|
||||
chain.addInputTensor(&tensor, sample_rate, channels_first);
|
||||
chain.addOutputFile(sf);
|
||||
chain.run();
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Holder for a malloc-style buffer that is free()d on destruction; used by
// save_audio_fileobj. `ptr` and `size` start out as nullptr / 0 and are
// meant to be filled in by the caller. The type is neither copyable nor
// movable.
struct AutoReleaseBuffer {
  char* ptr = nullptr;
  size_t size = 0;

  AutoReleaseBuffer() = default;

  // Copying and moving are disabled so the buffer is freed exactly once.
  AutoReleaseBuffer(const AutoReleaseBuffer& other) = delete;
  AutoReleaseBuffer(AutoReleaseBuffer&& other) = delete;
  AutoReleaseBuffer& operator=(const AutoReleaseBuffer& other) = delete;
  AutoReleaseBuffer& operator=(AutoReleaseBuffer&& other) = delete;

  ~AutoReleaseBuffer() {
    // free(nullptr) is a no-op, so no null check is required.
    free(ptr);
  }
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Encode `tensor` into an in-memory stream and write the bytes to the
// Python file-like object `fileobj`.
//
// `format` is mandatory here. `compression`, `encoding` and
// `bits_per_sample` are forwarded to get_encodinginfo_for_save.
//
// Throws std::runtime_error when
//   - `format` is missing,
//   - the format is "amr-nb", "htk" or "gsm" and the tensor has more than
//     one channel,
//   - the format is "gsm" and `sample_rate` is not 8000,
//   - libsox fails to open the memory stream.
void save_audio_fileobj(
    py::object fileobj,
    py::array tensor,
    int64_t sample_rate,
    bool channels_first,
    tl::optional<double> compression,
    tl::optional<std::string> format,
    tl::optional<std::string> encoding,
    tl::optional<int64_t> bits_per_sample) {
  if (!format.has_value()) {
    throw std::runtime_error(
        "`format` is required when saving to file object.");
  }
  const auto filetype = format.value();

  // amr-nb, htk and gsm are mono-only; gsm additionally requires 8 kHz.
  const bool mono_only =
      filetype == "amr-nb" || filetype == "htk" || filetype == "gsm";
  if (mono_only) {
    const auto num_channels = tensor.shape(channels_first ? 0 : 1);
    if (num_channels != 1) {
      throw std::runtime_error(
          filetype + " format only supports single channel audio.");
    }
    if (filetype == "gsm" && sample_rate != 8000) {
      throw std::runtime_error(
          "gsm format only supports a sampling rate of 8kHz.");
    }
  }

  const auto signal_info =
      get_signalinfo(&tensor, sample_rate, filetype, channels_first);
  const auto encoding_info = get_encodinginfo_for_save(filetype,
                                                       tensor.dtype(),
                                                       compression,
                                                       std::move(encoding),
                                                       bits_per_sample);

  // libsox fills in buffer.ptr / buffer.size; the holder frees the memory
  // when this function exits.
  AutoReleaseBuffer buffer;

  SoxFormat sf(sox_open_memstream_write(&buffer.ptr,
                                        &buffer.size,
                                        &signal_info,
                                        &encoding_info,
                                        filetype.c_str(),
                                        /*oob=*/nullptr));

  if (static_cast<sox_format_t*>(sf) == nullptr) {
    throw std::runtime_error(
        "Error saving audio file: failed to open memory stream.");
  }

  paddleaudio::sox_effects_chain::SoxEffectsChainPyBind chain(
      /*input_encoding=*/get_tensor_encodinginfo(tensor.dtype()),
      /*output_encoding=*/sf->encoding);
  chain.addInputTensor(&tensor, sample_rate, channels_first);
  chain.addOutputFileObj(sf, &buffer.ptr, &buffer.size, &fileobj);
  chain.run();

  // Closing the sox_format_t is necessary for flushing the last chunk to the
  // buffer
  sf.close();

  // Hand whatever is in the buffer after the flush to the Python object.
  fileobj.attr("write")(py::bytes(buffer.ptr, buffer.size));
}
|
||||
|
||||
}  // namespace sox_io
}  // namespace paddleaudio
|
@ -1,63 +0,0 @@
|
||||
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
|
||||
// All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/sox/utils.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace sox_io {
|
||||
|
||||
auto get_info_file(const std::string &path,
|
||||
const tl::optional<std::string> &format)
|
||||
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
|
||||
|
||||
auto get_info_fileobj(py::object fileobj,
|
||||
const tl::optional<std::string> &format)
|
||||
-> std::tuple<int64_t, int64_t, int64_t, int64_t, std::string>;
|
||||
|
||||
tl::optional<std::tuple<py::array, int64_t>> load_audio_fileobj(
|
||||
py::object fileobj,
|
||||
const tl::optional<int64_t>& frame_offset,
|
||||
const tl::optional<int64_t>& num_frames,
|
||||
tl::optional<bool> normalize,
|
||||
tl::optional<bool> channels_first,
|
||||
const tl::optional<std::string>& format);
|
||||
|
||||
void save_audio_fileobj(
|
||||
py::object fileobj,
|
||||
py::array tensor,
|
||||
int64_t sample_rate,
|
||||
bool channels_first,
|
||||
tl::optional<double> compression,
|
||||
tl::optional<std::string> format,
|
||||
tl::optional<std::string> encoding,
|
||||
tl::optional<int64_t> bits_per_sample);
|
||||
|
||||
auto get_effects(const tl::optional<int64_t>& frame_offset,
|
||||
const tl::optional<int64_t>& num_frames)
|
||||
-> std::vector<std::vector<std::string>>;
|
||||
|
||||
|
||||
tl::optional<std::tuple<py::array, int64_t>> load_audio_file(
|
||||
const std::string& path,
|
||||
const tl::optional<int64_t>& frame_offset,
|
||||
const tl::optional<int64_t>& num_frames,
|
||||
tl::optional<bool> normalize,
|
||||
tl::optional<bool> channels_first,
|
||||
const tl::optional<std::string>& format);
|
||||
|
||||
void save_audio_file(const std::string& path,
|
||||
py::array tensor,
|
||||
int64_t sample_rate,
|
||||
bool channels_first,
|
||||
tl::optional<double> compression,
|
||||
tl::optional<std::string> format,
|
||||
tl::optional<std::string> encoding,
|
||||
tl::optional<int64_t> bits_per_sample);
|
||||
|
||||
|
||||
}  // namespace sox_io
}  // namespace paddleaudio
|
@ -1,143 +0,0 @@
|
||||
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.cpp
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/sox/types.h"
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace sox_utils {
|
||||
|
||||
// Map a file-type string to the corresponding Format value.
//
// Recognised names: wav, mp3, flac, ogg / vorbis, amr-nb, amr-wb, amb, sph,
// htk and gsm. The comparison is exact (case-sensitive).
// Throws std::runtime_error for any other string.
Format get_format_from_string(const std::string& format) {
  struct NamedFormat {
    const char* name;
    Format value;
  };
  static const NamedFormat kKnownFormats[] = {
      {"wav", Format::WAV},
      {"mp3", Format::MP3},
      {"flac", Format::FLAC},
      {"ogg", Format::VORBIS},
      {"vorbis", Format::VORBIS},
      {"amr-nb", Format::AMR_NB},
      {"amr-wb", Format::AMR_WB},
      {"amb", Format::AMB},
      {"sph", Format::SPHERE},
      {"htk", Format::HTK},
      {"gsm", Format::GSM},
  };

  for (const auto& candidate : kKnownFormats) {
    if (format == candidate.name) {
      return candidate.value;
    }
  }

  std::ostringstream message;
  message << "Internal Error: unexpected format value: " << format;
  throw std::runtime_error(message.str());
}
|
||||
|
||||
// Return the short text label of an Encoding value (e.g. "PCM_S", "ULAW").
// Throws std::runtime_error for values without a label, which includes
// Encoding::NOT_PROVIDED.
std::string to_string(Encoding v) {
  const char* label = nullptr;
  switch (v) {
    case Encoding::UNKNOWN:      label = "UNKNOWN"; break;
    case Encoding::PCM_SIGNED:   label = "PCM_S";   break;
    case Encoding::PCM_UNSIGNED: label = "PCM_U";   break;
    case Encoding::PCM_FLOAT:    label = "PCM_F";   break;
    case Encoding::FLAC:         label = "FLAC";    break;
    case Encoding::ULAW:         label = "ULAW";    break;
    case Encoding::ALAW:         label = "ALAW";    break;
    case Encoding::MP3:          label = "MP3";     break;
    case Encoding::VORBIS:       label = "VORBIS";  break;
    case Encoding::AMR_WB:       label = "AMR_WB";  break;
    case Encoding::AMR_NB:       label = "AMR_NB";  break;
    case Encoding::OPUS:         label = "OPUS";    break;
    default:                     break;
  }
  if (label == nullptr) {
    throw std::runtime_error("Internal Error: unexpected encoding.");
  }
  return label;
}
|
||||
|
||||
// Translate the optional user-facing encoding string into an Encoding.
//
// An empty optional yields Encoding::NOT_PROVIDED. Accepted strings are
// "PCM_S", "PCM_U", "PCM_F", "ULAW" and "ALAW" (exact match).
// Throws std::runtime_error for any other string.
Encoding get_encoding_from_option(const tl::optional<std::string> encoding) {
  if (!encoding.has_value()) {
    return Encoding::NOT_PROVIDED;
  }

  struct NamedEncoding {
    const char* name;
    Encoding value;
  };
  static const NamedEncoding kAccepted[] = {
      {"PCM_S", Encoding::PCM_SIGNED},
      {"PCM_U", Encoding::PCM_UNSIGNED},
      {"PCM_F", Encoding::PCM_FLOAT},
      {"ULAW", Encoding::ULAW},
      {"ALAW", Encoding::ALAW},
  };

  const std::string& requested = encoding.value();
  for (const auto& candidate : kAccepted) {
    if (requested == candidate.name) {
      return candidate.value;
    }
  }

  std::ostringstream message;
  message << "Internal Error: unexpected encoding value: " << requested;
  throw std::runtime_error(message.str());
}
|
||||
|
||||
// Translate the optional bits-per-sample value into a BitDepth.
//
// An empty optional yields BitDepth::NOT_PROVIDED. Accepted values are
// 8, 16, 24, 32 and 64. Throws std::runtime_error for anything else.
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth) {
  if (!bit_depth.has_value()) {
    return BitDepth::NOT_PROVIDED;
  }

  const int64_t requested = bit_depth.value();
  if (requested == 8) return BitDepth::B8;
  if (requested == 16) return BitDepth::B16;
  if (requested == 24) return BitDepth::B24;
  if (requested == 32) return BitDepth::B32;
  if (requested == 64) return BitDepth::B64;

  std::ostringstream message;
  message << "Internal Error: unexpected bit depth value: " << requested;
  throw std::runtime_error(message.str());
}
|
||||
|
||||
// Return the text label for a libsox encoding constant.
// Encodings without an explicit case below, as well as
// SOX_ENCODING_UNKNOWN itself, are reported as "UNKNOWN".
std::string get_encoding(sox_encoding_t encoding) {
  const char* label = "UNKNOWN";
  switch (encoding) {
    case SOX_ENCODING_SIGN2:    label = "PCM_S";  break;
    case SOX_ENCODING_UNSIGNED: label = "PCM_U";  break;
    case SOX_ENCODING_FLOAT:    label = "PCM_F";  break;
    case SOX_ENCODING_FLAC:     label = "FLAC";   break;
    case SOX_ENCODING_ULAW:     label = "ULAW";   break;
    case SOX_ENCODING_ALAW:     label = "ALAW";   break;
    case SOX_ENCODING_MP3:      label = "MP3";    break;
    case SOX_ENCODING_VORBIS:   label = "VORBIS"; break;
    case SOX_ENCODING_AMR_WB:   label = "AMR_WB"; break;
    case SOX_ENCODING_AMR_NB:   label = "AMR_NB"; break;
    case SOX_ENCODING_OPUS:     label = "OPUS";   break;
    case SOX_ENCODING_GSM:      label = "GSM";    break;
    case SOX_ENCODING_UNKNOWN:
    default:
      break;
  }
  return label;
}
|
||||
|
||||
} // namespace sox_utils
|
||||
} // namespace paddleaudio
|
@ -1,58 +0,0 @@
|
||||
//code is from: https://github.com/pytorch/audio/blob/main/torchaudio/csrc/sox/types.h
|
||||
#pragma once
|
||||
|
||||
#include <sox.h>
|
||||
#include "paddlespeech/audio/src/optional/optional.hpp"
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace sox_utils {
|
||||
|
||||
// Container formats understood by the save path. The enumerators keep their
// implicit values (declaration order starting at 0).
enum class Format {
  WAV,     // 0
  MP3,     // 1
  FLAC,    // 2
  VORBIS,  // 3
  AMR_NB,  // 4
  AMR_WB,  // 5
  AMB,     // 6
  SPHERE,  // 7
  GSM,     // 8
  HTK,     // 9
};
|
||||
|
||||
Format get_format_from_string(const std::string& format);
|
||||
|
||||
// Sample encodings. NOT_PROVIDED marks "no encoding requested"; the
// enumerators keep their implicit values (declaration order starting at 0).
enum class Encoding {
  NOT_PROVIDED,  // 0
  UNKNOWN,       // 1
  PCM_SIGNED,    // 2
  PCM_UNSIGNED,  // 3
  PCM_FLOAT,     // 4
  FLAC,          // 5
  ULAW,          // 6
  ALAW,          // 7
  MP3,           // 8
  VORBIS,        // 9
  AMR_WB,        // 10
  AMR_NB,        // 11
  OPUS,          // 12
};
|
||||
|
||||
std::string to_string(Encoding v);
|
||||
Encoding get_encoding_from_option(const tl::optional<std::string> encoding);
|
||||
|
||||
// Bits per sample. Each enumerator's numeric value equals the bit count so
// it can be cast straight to `unsigned`; NOT_PROVIDED (0) marks "no bit
// depth requested".
enum class BitDepth : unsigned {
  NOT_PROVIDED = 0u,
  B8 = 8u,
  B16 = 16u,
  B24 = 24u,
  B32 = 32u,
  B64 = 64u,
};
|
||||
|
||||
BitDepth get_bit_depth_from_option(const tl::optional<int64_t> bit_depth);
|
||||
|
||||
std::string get_encoding(sox_encoding_t encoding);
|
||||
|
||||
} // namespace sox_utils
|
||||
} // namespace paddleaudio
|
@ -1,642 +0,0 @@
|
||||
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
|
||||
// All rights reserved.
|
||||
#include <sox.h>
|
||||
|
||||
#include "paddlespeech/audio/src/pybind/sox/utils.h"
|
||||
#include "paddlespeech/audio/src/pybind/sox/types.h"
|
||||
|
||||
#include <sstream>
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace sox_utils {
|
||||
|
||||
auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
|
||||
-> uint64_t {
|
||||
uint64_t num_read = 0;
|
||||
while (num_read < size) {
|
||||
auto request = size - num_read;
|
||||
auto chunk = static_cast<std::string>(
|
||||
static_cast<py::bytes>(fileobj->attr("read")(request)));
|
||||
auto chunk_len = chunk.length();
|
||||
if (chunk_len == 0) {
|
||||
break;
|
||||
}
|
||||
if (chunk_len > request) {
|
||||
std::ostringstream message;
|
||||
message
|
||||
<< "Requested up to " << request << " bytes but, "
|
||||
<< "received " << chunk_len << " bytes. "
|
||||
<< "The given object does not confirm to read protocol of file "
|
||||
"object.";
|
||||
throw std::runtime_error(message.str());
|
||||
}
|
||||
memcpy(buffer, chunk.data(), chunk_len);
|
||||
buffer += chunk_len;
|
||||
num_read += chunk_len;
|
||||
}
|
||||
return num_read;
|
||||
}
|
||||
|
||||
|
||||
void set_seed(const int64_t seed) {
|
||||
sox_get_globals()->ranqd1 = static_cast<sox_int32_t>(seed);
|
||||
}
|
||||
|
||||
void set_verbosity(const int64_t verbosity) {
|
||||
sox_get_globals()->verbosity = static_cast<unsigned>(verbosity);
|
||||
}
|
||||
|
||||
void set_use_threads(const bool use_threads) {
|
||||
sox_get_globals()->use_threads = static_cast<sox_bool>(use_threads);
|
||||
}
|
||||
|
||||
void set_buffer_size(const int64_t buffer_size) {
|
||||
sox_get_globals()->bufsiz = static_cast<size_t>(buffer_size);
|
||||
}
|
||||
|
||||
int64_t get_buffer_size() {
|
||||
return sox_get_globals()->bufsiz;
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::string>> list_effects() {
|
||||
std::vector<std::vector<std::string>> effects;
|
||||
for (const sox_effect_fn_t* fns = sox_get_effect_fns(); *fns; ++fns) {
|
||||
const sox_effect_handler_t* handler = (*fns)();
|
||||
if (handler && handler->name) {
|
||||
if (UNSUPPORTED_EFFECTS.find(handler->name) ==
|
||||
UNSUPPORTED_EFFECTS.end()) {
|
||||
effects.emplace_back(std::vector<std::string>{
|
||||
handler->name,
|
||||
handler->usage ? std::string(handler->usage) : std::string("")});
|
||||
}
|
||||
}
|
||||
}
|
||||
return effects;
|
||||
}
|
||||
|
||||
std::vector<std::string> list_write_formats() {
|
||||
std::vector<std::string> formats;
|
||||
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
|
||||
const sox_format_handler_t* handler = fns->fn();
|
||||
for (const char* const* names = handler->names; *names; ++names) {
|
||||
if (!strchr(*names, '/') && handler->write)
|
||||
formats.emplace_back(*names);
|
||||
}
|
||||
}
|
||||
return formats;
|
||||
}
|
||||
|
||||
std::vector<std::string> list_read_formats() {
|
||||
std::vector<std::string> formats;
|
||||
for (const sox_format_tab_t* fns = sox_get_format_fns(); fns->fn; ++fns) {
|
||||
const sox_format_handler_t* handler = fns->fn();
|
||||
for (const char* const* names = handler->names; *names; ++names) {
|
||||
if (!strchr(*names, '/') && handler->read)
|
||||
formats.emplace_back(*names);
|
||||
}
|
||||
}
|
||||
return formats;
|
||||
}
|
||||
|
||||
SoxFormat::SoxFormat(sox_format_t* fd) noexcept : fd_(fd) {}
|
||||
SoxFormat::~SoxFormat() {
|
||||
close();
|
||||
}
|
||||
|
||||
sox_format_t* SoxFormat::operator->() const noexcept {
|
||||
return fd_;
|
||||
}
|
||||
SoxFormat::operator sox_format_t*() const noexcept {
|
||||
return fd_;
|
||||
}
|
||||
|
||||
void SoxFormat::close() {
|
||||
if (fd_ != nullptr) {
|
||||
sox_close(fd_);
|
||||
fd_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void validate_input_file(const SoxFormat& sf, const std::string& path) {
|
||||
if (static_cast<sox_format_t*>(sf) == nullptr) {
|
||||
throw std::runtime_error(
|
||||
"Error loading audio file: failed to open file " + path);
|
||||
}
|
||||
if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||
throw std::runtime_error("Error loading audio file: unknown encoding.");
|
||||
}
|
||||
}
|
||||
|
||||
void validate_input_memfile(const SoxFormat &sf) {
|
||||
return validate_input_file(sf, "<in memory buffer>");
|
||||
}
|
||||
|
||||
// Check that `tensor` is a 2-D array with an accepted dtype.
// Throws std::runtime_error otherwise.
//
// NOTE(review): the accepted numpy type codes are 'f', 'd', 'l' and 'i',
// while the error message names float32, int32, int16 and uint8. These do
// not appear to describe the same set; confirm which one is intended.
void validate_input_tensor(const py::array tensor) {
  if (tensor.ndim() != 2) {
    throw std::runtime_error("Input tensor has to be 2D.");
  }

  switch (tensor.dtype().char_()) {
    case 'f':
    case 'd':
    case 'l':
    case 'i':
      return;
    default:
      throw std::runtime_error(
          "Input tensor has to be one of float32, int32, int16 or uint8 type.");
  }
}
|
||||
|
||||
// Choose the numpy dtype used to hold samples decoded from the given libsox
// encoding and precision.
//
//   SOX_ENCODING_UNSIGNED          -> "u1"
//   SOX_ENCODING_SIGN2, 16 bits    -> "i2"
//   SOX_ENCODING_SIGN2, 24/32 bits -> "i"
//   anything else                  -> "f"
//
// Throws std::runtime_error for signed PCM with any other precision.
py::dtype get_dtype(
    const sox_encoding_t encoding,
    const unsigned precision) {
  switch (encoding) {
    case SOX_ENCODING_UNSIGNED:  // 8-bit PCM WAV
      // This must be the string "u1". The previous 'u1' was a
      // multi-character char literal, i.e. an int with an
      // implementation-defined value, not a dtype name.
      return py::dtype("u1");
    case SOX_ENCODING_SIGN2:  // 16-bit, 24-bit, or 32-bit PCM WAV
      switch (precision) {
        case 16:
          return py::dtype("i2");
        case 24:  // Cast 24-bit to 32-bit.
        case 32:
          // "i" rather than "i4", so that the resulting type code is 'i',
          // which is what convert_to_tensor checks for.
          return py::dtype("i");
        default:
          throw std::runtime_error(
              "Only 16, 24, and 32 bits are supported for signed PCM.");
      }
    default:
      // default to float32 for the other formats, including
      // 32-bit floating-point WAV,
      // MP3,
      // FLAC,
      // VORBIS etc...
      return py::dtype("f");
  }
}
|
||||
|
||||
namespace {

// Copy the (num_rows, num_channels) array `frames_first`, whose elements
// are of type T, into a new (num_channels, num_rows) array of `dtype`.
template <typename T>
py::array transpose_to_channels_first(const py::array& frames_first,
                                      const py::dtype& dtype,
                                      const int32_t num_rows,
                                      const int32_t num_channels) {
  py::array transposed(dtype, {num_channels, num_rows});
  for (int32_t ch = 0; ch < num_channels; ++ch) {
    for (int32_t frame = 0; frame < num_rows; ++frame) {
      *static_cast<T*>(transposed.mutable_data(ch, frame)) =
          *static_cast<const T*>(frames_first.data(frame, ch));
    }
  }
  return transposed;
}

}  // namespace

// Convert interleaved libsox samples into a 2-D numpy array.
//
// `buffer` holds `num_samples` samples interleaved over `num_channels`
// channels. The result has shape (frames, channels), or
// (channels, frames) when `channels_first` is set, where
// frames = num_samples / num_channels.
//
// When `normalize` is set, or `dtype` is float32 ('f'), the samples are
// converted with SOX_SAMPLE_TO_FLOAT_32BIT and the result is always a
// float32 array. Otherwise the result uses `dtype`, which must be 'i'
// (samples copied as is), 'h' (int16) or 'B' (uint8).
//
// Throws std::runtime_error if `num_channels` is not positive or the dtype
// is not supported.
py::array convert_to_tensor(
    sox_sample_t* buffer,
    const int32_t num_samples,
    const int32_t num_channels,
    const py::dtype dtype,
    const bool normalize,
    const bool channels_first) {
  if (num_channels <= 0) {
    // Guard the division below.
    throw std::runtime_error(
        "Invalid argument: num_channels must be positive.");
  }

  // The SOX_SAMPLE_TO_* macros need these locals and a clip counter; the
  // counter is not reported.
  uint64_t dummy = 0;
  SOX_SAMPLE_LOCALS;

  const int32_t num_rows = num_samples / num_channels;
  // Only whole frames fit in the output, so never write past them even if
  // num_samples is not a multiple of num_channels.
  const int32_t num_elements = num_rows * num_channels;
  const char code = dtype.char_();

  if (normalize || code == 'f') {
    // The output must be float32 here regardless of `dtype`: floats written
    // into an array allocated with a narrower native dtype would overflow
    // its storage.
    const py::dtype float_dtype("f");
    py::array t(float_dtype, {num_rows, num_channels});
    // mutable_data() without indices is also valid for an empty array.
    auto* ptr = static_cast<float*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = SOX_SAMPLE_TO_FLOAT_32BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return transpose_to_channels_first<float>(
          t, float_dtype, num_rows, num_channels);
    }
    return t;
  }

  if (code == 'i') {
    py::array t(dtype, {num_rows, num_channels});
    auto* ptr = static_cast<int*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = buffer[i];
    }
    if (channels_first) {
      return transpose_to_channels_first<int>(
          t, dtype, num_rows, num_channels);
    }
    return t;
  }

  if (code == 'h') {  // int16
    py::array t(dtype, {num_rows, num_channels});
    auto* ptr = static_cast<int16_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = SOX_SAMPLE_TO_SIGNED_16BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return transpose_to_channels_first<int16_t>(
          t, dtype, num_rows, num_channels);
    }
    return t;
  }

  if (code == 'B') {  // uint8; numpy's type code is upper-case 'B'
    py::array t(dtype, {num_rows, num_channels});
    auto* ptr = static_cast<uint8_t*>(t.mutable_data());
    for (int32_t i = 0; i < num_elements; ++i) {
      ptr[i] = SOX_SAMPLE_TO_UNSIGNED_8BIT(buffer[i], dummy);
    }
    if (channels_first) {
      return transpose_to_channels_first<uint8_t>(
          t, dtype, num_rows, num_channels);
    }
    return t;
  }

  throw std::runtime_error("Unsupported dtype.");
}
|
||||
|
||||
// Return the lower-cased text after the last '.' in `path`.
//
// If `path` contains no '.', the whole path is returned lower-cased; if it
// ends in '.', the result is empty.
const std::string get_filetype(const std::string path) {
  std::string ext = path.substr(path.find_last_of(".") + 1);
  // tolower() is only defined for values representable as unsigned char (or
  // EOF), so go through unsigned char to stay well-defined for bytes >= 0x80.
  std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });
  return ext;
}
|
||||
|
||||
namespace {
|
||||
|
||||
std::tuple<sox_encoding_t, unsigned> get_save_encoding_for_wav(
|
||||
const std::string format,
|
||||
py::dtype dtype,
|
||||
const Encoding& encoding,
|
||||
const BitDepth& bits_per_sample) {
|
||||
switch (encoding) {
|
||||
case Encoding::NOT_PROVIDED:
|
||||
switch (bits_per_sample) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
switch (dtype.num()) {
|
||||
case 11: // float32 numpy dtype num
|
||||
return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
|
||||
case 5: // int numpy dtype num
|
||||
return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
|
||||
case 3: // int16 numpy
|
||||
return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
|
||||
case 1: // byte numpy
|
||||
return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
|
||||
default:
|
||||
throw std::runtime_error("Internal Error: Unexpected dtype.");
|
||||
}
|
||||
case BitDepth::B8:
|
||||
return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
|
||||
default:
|
||||
return std::make_tuple<>(
|
||||
SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
|
||||
}
|
||||
case Encoding::PCM_SIGNED:
|
||||
switch (bits_per_sample) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
|
||||
case BitDepth::B8:
|
||||
throw std::runtime_error(
|
||||
format + " does not support 8-bit signed PCM encoding.");
|
||||
default:
|
||||
return std::make_tuple<>(
|
||||
SOX_ENCODING_SIGN2, static_cast<unsigned>(bits_per_sample));
|
||||
}
|
||||
case Encoding::PCM_UNSIGNED:
|
||||
switch (bits_per_sample) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
case BitDepth::B8:
|
||||
return std::make_tuple<>(SOX_ENCODING_UNSIGNED, 8);
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
format + " only supports 8-bit for unsigned PCM encoding.");
|
||||
}
|
||||
case Encoding::PCM_FLOAT:
|
||||
switch (bits_per_sample) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
case BitDepth::B32:
|
||||
return std::make_tuple<>(SOX_ENCODING_FLOAT, 32);
|
||||
case BitDepth::B64:
|
||||
return std::make_tuple<>(SOX_ENCODING_FLOAT, 64);
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
format +
|
||||
" only supports 32-bit or 64-bit for floating-point PCM encoding.");
|
||||
}
|
||||
case Encoding::ULAW:
|
||||
switch (bits_per_sample) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
case BitDepth::B8:
|
||||
return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
format + " only supports 8-bit for mu-law encoding.");
|
||||
}
|
||||
case Encoding::ALAW:
|
||||
switch (bits_per_sample) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
case BitDepth::B8:
|
||||
return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
format + " only supports 8-bit for a-law encoding.");
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
format + " does not support encoding: " + to_string(encoding));
|
||||
}
|
||||
}
|
||||
|
||||
std::tuple<sox_encoding_t, unsigned> get_save_encoding(
|
||||
const std::string& format,
|
||||
const py::dtype dtype,
|
||||
const tl::optional<std::string> encoding,
|
||||
const tl::optional<int64_t> bits_per_sample) {
|
||||
const Format fmt = get_format_from_string(format);
|
||||
const Encoding enc = get_encoding_from_option(encoding);
|
||||
const BitDepth bps = get_bit_depth_from_option(bits_per_sample);
|
||||
|
||||
switch (fmt) {
|
||||
case Format::WAV:
|
||||
case Format::AMB:
|
||||
return get_save_encoding_for_wav(format, dtype, enc, bps);
|
||||
case Format::MP3:
|
||||
if (enc != Encoding::NOT_PROVIDED)
|
||||
throw std::runtime_error("mp3 does not support `encoding` option.");
|
||||
if (bps != BitDepth::NOT_PROVIDED)
|
||||
throw std::runtime_error(
|
||||
"mp3 does not support `bits_per_sample` option.");
|
||||
return std::make_tuple<>(SOX_ENCODING_MP3, 16);
|
||||
case Format::HTK:
|
||||
if (enc != Encoding::NOT_PROVIDED)
|
||||
throw std::runtime_error("htk does not support `encoding` option.");
|
||||
if (bps != BitDepth::NOT_PROVIDED)
|
||||
throw std::runtime_error(
|
||||
"htk does not support `bits_per_sample` option.");
|
||||
return std::make_tuple<>(SOX_ENCODING_SIGN2, 16);
|
||||
case Format::VORBIS:
|
||||
if (enc != Encoding::NOT_PROVIDED)
|
||||
throw std::runtime_error("vorbis does not support `encoding` option.");
|
||||
if (bps != BitDepth::NOT_PROVIDED)
|
||||
throw std::runtime_error(
|
||||
"vorbis does not support `bits_per_sample` option.");
|
||||
return std::make_tuple<>(SOX_ENCODING_VORBIS, 16);
|
||||
case Format::AMR_NB:
|
||||
if (enc != Encoding::NOT_PROVIDED)
|
||||
throw std::runtime_error("amr-nb does not support `encoding` option.");
|
||||
if (bps != BitDepth::NOT_PROVIDED)
|
||||
throw std::runtime_error(
|
||||
"amr-nb does not support `bits_per_sample` option.");
|
||||
return std::make_tuple<>(SOX_ENCODING_AMR_NB, 16);
|
||||
case Format::FLAC:
|
||||
if (enc != Encoding::NOT_PROVIDED)
|
||||
throw std::runtime_error("flac does not support `encoding` option.");
|
||||
switch (bps) {
|
||||
case BitDepth::B32:
|
||||
case BitDepth::B64:
|
||||
throw std::runtime_error(
|
||||
"flac does not support `bits_per_sample` larger than 24.");
|
||||
default:
|
||||
return std::make_tuple<>(
|
||||
SOX_ENCODING_FLAC, static_cast<unsigned>(bps));
|
||||
}
|
||||
case Format::SPHERE:
|
||||
switch (enc) {
|
||||
case Encoding::NOT_PROVIDED:
|
||||
case Encoding::PCM_SIGNED:
|
||||
switch (bps) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
return std::make_tuple<>(SOX_ENCODING_SIGN2, 32);
|
||||
default:
|
||||
return std::make_tuple<>(
|
||||
SOX_ENCODING_SIGN2, static_cast<unsigned>(bps));
|
||||
}
|
||||
case Encoding::PCM_UNSIGNED:
|
||||
throw std::runtime_error(
|
||||
"sph does not support unsigned integer PCM.");
|
||||
case Encoding::PCM_FLOAT:
|
||||
throw std::runtime_error("sph does not support floating point PCM.");
|
||||
case Encoding::ULAW:
|
||||
switch (bps) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
case BitDepth::B8:
|
||||
return std::make_tuple<>(SOX_ENCODING_ULAW, 8);
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"sph only supports 8-bit for mu-law encoding.");
|
||||
}
|
||||
case Encoding::ALAW:
|
||||
switch (bps) {
|
||||
case BitDepth::NOT_PROVIDED:
|
||||
case BitDepth::B8:
|
||||
return std::make_tuple<>(SOX_ENCODING_ALAW, 8);
|
||||
default:
|
||||
return std::make_tuple<>(
|
||||
SOX_ENCODING_ALAW, static_cast<unsigned>(bps));
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"sph does not support encoding: " + encoding.value());
|
||||
}
|
||||
case Format::GSM:
|
||||
if (enc != Encoding::NOT_PROVIDED)
|
||||
throw std::runtime_error("gsm does not support `encoding` option.");
|
||||
if (bps != BitDepth::NOT_PROVIDED)
|
||||
throw std::runtime_error(
|
||||
"gsm does not support `bits_per_sample` option.");
|
||||
return std::make_tuple<>(SOX_ENCODING_GSM, 16);
|
||||
|
||||
default:
|
||||
throw std::runtime_error("Unsupported format: " + format);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned get_precision(const std::string filetype, py::dtype dtype) {
|
||||
if (filetype == "mp3")
|
||||
return SOX_UNSPEC;
|
||||
if (filetype == "flac")
|
||||
return 24;
|
||||
if (filetype == "ogg" || filetype == "vorbis")
|
||||
return SOX_UNSPEC;
|
||||
if (filetype == "wav" || filetype == "amb") {
|
||||
switch (dtype.num()) {
|
||||
case 1: // byte in numpy dype num
|
||||
return 8;
|
||||
case 3: // short, in numpy dtype num
|
||||
return 16;
|
||||
case 5: // int, numpy dtype
|
||||
return 32;
|
||||
case 11: // float, numpy dtype
|
||||
return 32;
|
||||
default:
|
||||
throw std::runtime_error("Unsupported dtype.");
|
||||
}
|
||||
}
|
||||
if (filetype == "sph")
|
||||
return 32;
|
||||
if (filetype == "amr-nb") {
|
||||
return 16;
|
||||
}
|
||||
if (filetype == "gsm") {
|
||||
return 16;
|
||||
}
|
||||
if (filetype == "htk") {
|
||||
return 16;
|
||||
}
|
||||
throw std::runtime_error("Unsupported file type: " + filetype);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/// Build the sox_signalinfo_t describing `waveform`.
///
/// The channel count is read from axis 0 when `channels_first` is true and
/// from axis 1 otherwise; the length is the total element count of the array.
/// The precision comes from get_precision(filetype, dtype).
sox_signalinfo_t get_signalinfo(
    const py::array* waveform,
    const int64_t sample_rate,
    const std::string filetype,
    const bool channels_first) {
  const int channel_axis = channels_first ? 0 : 1;

  // Value-initialize so members not assigned below stay zeroed, exactly as
  // with the partial aggregate initialization this replaces.
  sox_signalinfo_t info{};
  info.rate = static_cast<sox_rate_t>(sample_rate);
  info.channels = static_cast<unsigned>(waveform->shape(channel_axis));
  info.precision = get_precision(filetype, waveform->dtype());
  info.length = static_cast<uint64_t>(waveform->size());
  return info;
}
|
||||
|
||||
/// Build the sox_encodinginfo_t used for in-memory array I/O.
///
/// The encoding and bit depth are selected from the numpy type number of
/// `dtype`; all remaining fields get libsox defaults.
/// Throws std::runtime_error("Unsupported dtype.") for any other type number.
sox_encodinginfo_t get_tensor_encodinginfo(py::dtype dtype) {
  sox_encoding_t sample_encoding = SOX_ENCODING_UNKNOWN;
  unsigned sample_bits = 0;

  // NOTE(review): numpy type number 1 is NPY_BYTE (int8) and uint8 is 2 --
  // confirm the unsigned 8-bit mapping below is what callers pass in.
  switch (dtype.num()) {
    case 1:  // byte
      sample_encoding = SOX_ENCODING_UNSIGNED;
      sample_bits = 8;
      break;
    case 3:  // short
      sample_encoding = SOX_ENCODING_SIGN2;
      sample_bits = 16;
      break;
    case 5:  // int32
      sample_encoding = SOX_ENCODING_SIGN2;
      sample_bits = 32;
      break;
    case 11:  // float
      sample_encoding = SOX_ENCODING_FLOAT;
      sample_bits = 32;
      break;
    default:
      throw std::runtime_error("Unsupported dtype.");
  }

  sox_encodinginfo_t info{};
  info.encoding = sample_encoding;
  info.bits_per_sample = sample_bits;
  info.compression = HUGE_VAL;
  info.reverse_bytes = sox_option_default;
  info.reverse_nibbles = sox_option_default;
  info.reverse_bits = sox_option_default;
  info.opposite_endian = sox_false;
  return info;
}
|
||||
|
||||
/// Build the sox_encodinginfo_t used when saving to a file or file object.
///
/// The encoding and bit depth are resolved by get_save_encoding (which may
/// throw std::runtime_error); `compression` falls back to HUGE_VAL when it is
/// not provided. The remaining fields get libsox defaults.
sox_encodinginfo_t get_encodinginfo_for_save(
    const std::string& format,
    const py::dtype dtype,
    const tl::optional<double> compression,
    const tl::optional<std::string> encoding,
    const tl::optional<int64_t> bits_per_sample) {
  const auto resolved =
      get_save_encoding(format, dtype, encoding, bits_per_sample);

  sox_encodinginfo_t info{};
  info.encoding = std::get<0>(resolved);
  info.bits_per_sample = std::get<1>(resolved);
  info.compression = compression.value_or(HUGE_VAL);
  info.reverse_bytes = sox_option_default;
  info.reverse_nibbles = sox_option_default;
  info.reverse_bits = sox_option_default;
  info.opposite_endian = sox_false;
  return info;
}
|
||||
|
||||
|
||||
/*
|
||||
SoxFormat::SoxFormat(sox_format_t *fd) noexcept : fd_(fd) {}
|
||||
SoxFormat::~SoxFormat() { close(); }
|
||||
|
||||
sox_format_t *SoxFormat::operator->() const noexcept { return fd_; }
|
||||
SoxFormat::operator sox_format_t *() const noexcept { return fd_; }
|
||||
|
||||
void SoxFormat::close() {
|
||||
if (fd_ != nullptr) {
|
||||
sox_close(fd_);
|
||||
fd_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto read_fileobj(py::object *fileobj, const uint64_t size, char *buffer)
|
||||
-> uint64_t {
|
||||
uint64_t num_read = 0;
|
||||
while (num_read < size) {
|
||||
auto request = size - num_read;
|
||||
auto chunk = static_cast<std::string>(
|
||||
static_cast<py::bytes>(fileobj->attr("read")(request)));
|
||||
auto chunk_len = chunk.length();
|
||||
if (chunk_len == 0) {
|
||||
break;
|
||||
}
|
||||
if (chunk_len > request) {
|
||||
std::ostringstream message;
|
||||
message
|
||||
<< "Requested up to " << request << " bytes but, "
|
||||
<< "received " << chunk_len << " bytes. "
|
||||
<< "The given object does not confirm to read protocol of file "
|
||||
"object.";
|
||||
throw std::runtime_error(message.str());
|
||||
}
|
||||
memcpy(buffer, chunk.data(), chunk_len);
|
||||
buffer += chunk_len;
|
||||
num_read += chunk_len;
|
||||
}
|
||||
return num_read;
|
||||
}
|
||||
|
||||
int64_t get_buffer_size() { return sox_get_globals()->bufsiz; }
|
||||
|
||||
void validate_input_file(const SoxFormat &sf, const std::string &path) {
|
||||
if (static_cast<sox_format_t *>(sf) == nullptr) {
|
||||
throw std::runtime_error(
|
||||
"Error loading audio file: failed to open file " + path);
|
||||
}
|
||||
if (sf->encoding.encoding == SOX_ENCODING_UNKNOWN) {
|
||||
throw std::runtime_error("Error loading audio file: unknown encoding.");
|
||||
}
|
||||
}
|
||||
|
||||
void validate_input_memfile(const SoxFormat &sf) {
|
||||
return validate_input_file(sf, "<in memory buffer>");
|
||||
}
|
||||
|
||||
std::string get_encoding(sox_encoding_t encoding) {
|
||||
switch (encoding) {
|
||||
case SOX_ENCODING_UNKNOWN:
|
||||
return "UNKNOWN";
|
||||
case SOX_ENCODING_SIGN2:
|
||||
return "PCM_S";
|
||||
case SOX_ENCODING_UNSIGNED:
|
||||
return "PCM_U";
|
||||
case SOX_ENCODING_FLOAT:
|
||||
return "PCM_F";
|
||||
case SOX_ENCODING_FLAC:
|
||||
return "FLAC";
|
||||
case SOX_ENCODING_ULAW:
|
||||
return "ULAW";
|
||||
case SOX_ENCODING_ALAW:
|
||||
return "ALAW";
|
||||
case SOX_ENCODING_MP3:
|
||||
return "MP3";
|
||||
case SOX_ENCODING_VORBIS:
|
||||
return "VORBIS";
|
||||
case SOX_ENCODING_AMR_WB:
|
||||
return "AMR_WB";
|
||||
case SOX_ENCODING_AMR_NB:
|
||||
return "AMR_NB";
|
||||
case SOX_ENCODING_OPUS:
|
||||
return "OPUS";
|
||||
case SOX_ENCODING_GSM:
|
||||
return "GSM";
|
||||
default:
|
||||
return "UNKNOWN";
|
||||
}
|
||||
}
|
||||
*/
|
||||
} // namespace sox_utils
|
||||
} // namespace paddleaudio
|
@ -1,116 +0,0 @@
|
||||
// Copyright (c) 2017 Facebook Inc. (Soumith Chintala),
|
||||
// All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/numpy.h>
|
||||
#include <sox.h>
|
||||
#include "paddlespeech/audio/src/optional/optional.hpp"
|
||||
|
||||
namespace py = pybind11;
|
||||
|
||||
namespace paddleaudio {
|
||||
namespace sox_utils {
|
||||
|
||||
auto read_fileobj(py::object *fileobj, uint64_t size, char *buffer) -> uint64_t;
|
||||
|
||||
void set_seed(const int64_t seed);
|
||||
|
||||
void set_verbosity(const int64_t verbosity);
|
||||
|
||||
void set_use_threads(const bool use_threads);
|
||||
|
||||
void set_buffer_size(const int64_t buffer_size);
|
||||
|
||||
int64_t get_buffer_size();
|
||||
|
||||
std::vector<std::vector<std::string>> list_effects();
|
||||
|
||||
std::vector<std::string> list_read_formats();
|
||||
|
||||
std::vector<std::string> list_write_formats();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Utilities for sox_io / sox_effects implementations
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const std::unordered_set<std::string> UNSUPPORTED_EFFECTS =
|
||||
{"input", "output", "spectrogram", "noiseprof", "noisered", "splice"};
|
||||
|
||||
/// helper class to automatically close sox_format_t*
|
||||
struct SoxFormat {
|
||||
explicit SoxFormat(sox_format_t* fd) noexcept;
|
||||
SoxFormat(const SoxFormat& other) = delete;
|
||||
SoxFormat(SoxFormat&& other) = delete;
|
||||
SoxFormat& operator=(const SoxFormat& other) = delete;
|
||||
SoxFormat& operator=(SoxFormat&& other) = delete;
|
||||
~SoxFormat();
|
||||
sox_format_t* operator->() const noexcept;
|
||||
operator sox_format_t*() const noexcept;
|
||||
|
||||
void close();
|
||||
|
||||
private:
|
||||
sox_format_t* fd_;
|
||||
};
|
||||
|
||||
///
|
||||
/// Verify that input Tensor is 2D, CPU and either uint8, int16, int32 or float32
|
||||
void validate_input_tensor(const py::array);
|
||||
|
||||
void validate_input_file(const SoxFormat& sf, const std::string& path);
|
||||
|
||||
void validate_input_memfile(const SoxFormat &sf);
|
||||
///
|
||||
/// Get target dtype for the given encoding and precision.
|
||||
py::dtype get_dtype(
|
||||
const sox_encoding_t encoding,
|
||||
const unsigned precision);
|
||||
|
||||
///
|
||||
/// Convert sox_sample_t buffer to uint8/int16/int32/float32 Tensor
|
||||
/// NOTE: This function might modify the values in the input buffer to
|
||||
/// reduce the number of memory copy.
|
||||
/// @param buffer Pointer to buffer that contains audio data.
|
||||
/// @param num_samples The number of samples to read.
|
||||
/// @param num_channels The number of channels. Used to reshape the resulting
|
||||
/// Tensor.
|
||||
/// @param dtype Target dtype. Determines the output dtype and value range in
|
||||
/// conjunction with normalization.
|
||||
/// @param normalize Perform normalization. Only effective when dtype is not
|
||||
/// kFloat32. When effective, the output tensor is kFloat32 type and value range
|
||||
/// is [-1.0, 1.0]
|
||||
/// @param channels_first When True, output Tensor has shape of [num_channels,
|
||||
/// num_frames].
|
||||
py::array convert_to_tensor(
|
||||
sox_sample_t* buffer,
|
||||
const int32_t num_samples,
|
||||
const int32_t num_channels,
|
||||
const py::dtype dtype,
|
||||
const bool normalize,
|
||||
const bool channels_first);
|
||||
|
||||
/// Extract extension from file path
|
||||
const std::string get_filetype(const std::string path);
|
||||
|
||||
/// Get sox_signalinfo_t for passing a py::array object.
|
||||
sox_signalinfo_t get_signalinfo(
|
||||
const py::array* waveform,
|
||||
const int64_t sample_rate,
|
||||
const std::string filetype,
|
||||
const bool channels_first);
|
||||
|
||||
/// Get sox_encodinginfo_t for Tensor I/O
|
||||
sox_encodinginfo_t get_tensor_encodinginfo(const py::dtype dtype);
|
||||
|
||||
/// Get sox_encodinginfo_t for saving to file/file object
|
||||
sox_encodinginfo_t get_encodinginfo_for_save(
|
||||
const std::string& format,
|
||||
const py::dtype dtype,
|
||||
const tl::optional<double> compression,
|
||||
const tl::optional<std::string> encoding,
|
||||
const tl::optional<int64_t> bits_per_sample);
|
||||
|
||||
} // namespace sox_utils
|
||||
} // namespace paddleaudio
|
@ -1,33 +0,0 @@
|
||||
namespace paddleaudio {
|
||||
|
||||
namespace {
|
||||
|
||||
// Reports whether this translation unit was compiled with INCLUDE_SOX
// defined. This is a build-time flag, not a runtime probe.
bool is_sox_available() {
#ifdef INCLUDE_SOX
  constexpr bool kCompiledWithSox = true;
#else
  constexpr bool kCompiledWithSox = false;
#endif
  return kCompiledWithSox;
}
|
||||
|
||||
// Reports whether this translation unit was compiled with INCLUDE_KALDI
// defined. This is a build-time flag, not a runtime probe.
bool is_kaldi_available() {
#ifdef INCLUDE_KALDI
  constexpr bool kCompiledWithKaldi = true;
#else
  constexpr bool kCompiledWithKaldi = false;
#endif
  return kCompiledWithKaldi;
}
|
||||
|
||||
// Reports whether paddleaudio was compiled with ffmpeg (USE_FFMPEG defined).
// It says nothing about whether ffmpeg is usable at runtime.
bool is_ffmpeg_available() {
#ifdef USE_FFMPEG
  constexpr bool kCompiledWithFfmpeg = true;
#else
  constexpr bool kCompiledWithFfmpeg = false;
#endif
  return kCompiledWithFfmpeg;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace paddleaudio
|
@ -1,2 +0,0 @@
|
||||
archives/
|
||||
install/
|
@ -1,15 +0,0 @@
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
|
||||
|
||||
################################################################################
|
||||
# sox
|
||||
################################################################################
|
||||
if (BUILD_SOX)
|
||||
add_subdirectory(sox)
|
||||
endif()
|
||||
|
||||
################################################################################
|
||||
# kaldi
|
||||
################################################################################
|
||||
if (BUILD_KALDI)
|
||||
add_subdirectory(kaldi)
|
||||
endif()
|
@ -1,117 +0,0 @@
|
||||
# checkout the thirdparty/kaldi/base/kaldi-types.h
|
||||
# compile kaldi without openfst
|
||||
add_definitions("-DCOMPILE_WITHOUT_OPENFST")
|
||||
|
||||
# function (define_library name source include_dirs link_libraries compile_defs)
|
||||
# add_library(${name} INTERFACE ${source})
|
||||
# target_include_directories(${name} INTERFACE ${include_dirs})
|
||||
# target_link_libraries(${name} INTERFACE ${link_libraries})
|
||||
# target_compile_definitions(${name} INTERFACE ${compile_defs})
|
||||
# set_target_properties(${name} PROPERTIES PREFIX "")
|
||||
# if (MSVC)
|
||||
# set_target_properties(${name} PROPERTIES SUFFIX ".pyd")
|
||||
# endif(MSVC)
|
||||
# install(
|
||||
# TARGETS ${name}
|
||||
# LIBRARY DESTINATION lib
|
||||
# RUNTIME DESTINATION lib # For Windows
|
||||
# )
|
||||
# endfunction()
|
||||
|
||||
# kaldi-base
|
||||
add_library(kaldi-base STATIC
|
||||
base/io-funcs.cc
|
||||
base/kaldi-error.cc
|
||||
base/kaldi-math.cc
|
||||
base/kaldi-utils.cc
|
||||
base/timer.cc
|
||||
)
|
||||
target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
|
||||
# kaldi-matrix
|
||||
add_library(kaldi-matrix STATIC
|
||||
matrix/compressed-matrix.cc
|
||||
matrix/matrix-functions.cc
|
||||
matrix/kaldi-matrix.cc
|
||||
matrix/kaldi-vector.cc
|
||||
matrix/optimization.cc
|
||||
matrix/packed-matrix.cc
|
||||
matrix/qr.cc
|
||||
matrix/sparse-matrix.cc
|
||||
matrix/sp-matrix.cc
|
||||
matrix/srfft.cc
|
||||
matrix/tp-matrix.cc
|
||||
)
|
||||
target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(kaldi-matrix PUBLIC gfortran kaldi-base libopenblas)
|
||||
|
||||
|
||||
# kaldi-util
|
||||
add_library(kaldi-util STATIC
|
||||
util/kaldi-holder.cc
|
||||
util/kaldi-io.cc
|
||||
util/kaldi-semaphore.cc
|
||||
util/kaldi-table.cc
|
||||
util/kaldi-thread.cc
|
||||
util/parse-options.cc
|
||||
util/simple-io-funcs.cc
|
||||
util/simple-options.cc
|
||||
util/text-utils.cc
|
||||
)
|
||||
target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
|
||||
|
||||
|
||||
# kaldi-feat-common
|
||||
add_library(kaldi-feat-common STATIC
|
||||
feat/cmvn.cc
|
||||
feat/feature-functions.cc
|
||||
feat/feature-window.cc
|
||||
feat/mel-computations.cc
|
||||
feat/pitch-functions.cc
|
||||
feat/resample.cc
|
||||
feat/signal.cc
|
||||
feat/wave-reader.cc
|
||||
)
|
||||
target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
|
||||
|
||||
|
||||
# kaldi-mfcc
|
||||
add_library(kaldi-mfcc STATIC
|
||||
feat/feature-mfcc.cc
|
||||
)
|
||||
target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
|
||||
|
||||
|
||||
# kaldi-fbank
|
||||
add_library(kaldi-fbank STATIC
|
||||
feat/feature-fbank.cc
|
||||
)
|
||||
target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
|
||||
|
||||
|
||||
set(KALDI_LIBRARIES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
|
||||
)
|
||||
|
||||
add_library(libkaldi INTERFACE)
|
||||
add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
|
||||
target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(libkaldi INTERFACE
|
||||
# --whole-archive for undefined symbol when link static lib into shared lib
|
||||
-Wl,--start-group -Wl,--whole-archive
|
||||
${KALDI_LIBRARIES}
|
||||
libopenblas
|
||||
gfortran
|
||||
-Wl,--no-whole-archive -Wl,--end-group
|
||||
)
|
||||
target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST")
|
@ -1 +0,0 @@
|
||||
../../../../speechx/speechx/kaldi/base
|
@ -1 +0,0 @@
|
||||
../../../../speechx/speechx/kaldi/feat
|
@ -1 +0,0 @@
|
||||
../../../../speechx/speechx/kaldi/matrix
|
@ -1 +0,0 @@
|
||||
../../../../speechx/speechx/kaldi/util
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,86 +0,0 @@
|
||||
See the following for the origin of this patch
|
||||
http://www.linuxfromscratch.org/blfs/view/svn/multimedia/libmad.html
|
||||
http://www.linuxfromscratch.org/patches/blfs/svn/libmad-0.15.1b-fixes-1.patch
|
||||
--- src/libmad/configure 2004-02-05 09:34:07.000000000 +0000
|
||||
+++ src/libmad/configure.new 2020-06-30 21:10:28.528018931 +0000
|
||||
@@ -19083,71 +19083,7 @@
|
||||
|
||||
if test "$GCC" = yes
|
||||
then
|
||||
- if test -z "$arch"
|
||||
- then
|
||||
- case "$host" in
|
||||
- i386-*) ;;
|
||||
- i?86-*) arch="-march=i486" ;;
|
||||
- arm*-empeg-*) arch="-march=armv4 -mtune=strongarm1100" ;;
|
||||
- armv4*-*) arch="-march=armv4 -mtune=strongarm" ;;
|
||||
- powerpc-*) ;;
|
||||
- mips*-agenda-*) arch="-mcpu=vr4100" ;;
|
||||
- mips*-luxsonor-*) arch="-mips1 -mcpu=r3000 -Wa,-m4010" ;;
|
||||
- esac
|
||||
- fi
|
||||
-
|
||||
- case "$optimize" in
|
||||
- -O|"-O "*)
|
||||
- optimize="-O"
|
||||
- optimize="$optimize -fforce-mem"
|
||||
- optimize="$optimize -fforce-addr"
|
||||
- : #x optimize="$optimize -finline-functions"
|
||||
- : #- optimize="$optimize -fstrength-reduce"
|
||||
- optimize="$optimize -fthread-jumps"
|
||||
- optimize="$optimize -fcse-follow-jumps"
|
||||
- optimize="$optimize -fcse-skip-blocks"
|
||||
- : #x optimize="$optimize -frerun-cse-after-loop"
|
||||
- : #x optimize="$optimize -frerun-loop-opt"
|
||||
- : #x optimize="$optimize -fgcse"
|
||||
- optimize="$optimize -fexpensive-optimizations"
|
||||
- optimize="$optimize -fregmove"
|
||||
- : #* optimize="$optimize -fdelayed-branch"
|
||||
- : #x optimize="$optimize -fschedule-insns"
|
||||
- optimize="$optimize -fschedule-insns2"
|
||||
- : #? optimize="$optimize -ffunction-sections"
|
||||
- : #? optimize="$optimize -fcaller-saves"
|
||||
- : #> optimize="$optimize -funroll-loops"
|
||||
- : #> optimize="$optimize -funroll-all-loops"
|
||||
- : #x optimize="$optimize -fmove-all-movables"
|
||||
- : #x optimize="$optimize -freduce-all-givs"
|
||||
- : #? optimize="$optimize -fstrict-aliasing"
|
||||
- : #* optimize="$optimize -fstructure-noalias"
|
||||
-
|
||||
- case "$host" in
|
||||
- arm*-*)
|
||||
- optimize="$optimize -fstrength-reduce"
|
||||
- ;;
|
||||
- mips*-*)
|
||||
- optimize="$optimize -fstrength-reduce"
|
||||
- optimize="$optimize -finline-functions"
|
||||
- ;;
|
||||
- i?86-*)
|
||||
- optimize="$optimize -fstrength-reduce"
|
||||
- ;;
|
||||
- powerpc-apple-*)
|
||||
- # this triggers an internal compiler error with gcc2
|
||||
- : #optimize="$optimize -fstrength-reduce"
|
||||
-
|
||||
- # this is really only beneficial with gcc3
|
||||
- : #optimize="$optimize -finline-functions"
|
||||
- ;;
|
||||
- *)
|
||||
- # this sometimes provokes bugs in gcc 2.95.2
|
||||
- : #optimize="$optimize -fstrength-reduce"
|
||||
- ;;
|
||||
- esac
|
||||
- ;;
|
||||
- esac
|
||||
+ optimize="-O2"
|
||||
fi
|
||||
|
||||
case "$host" in
|
||||
@@ -21497,6 +21433,7 @@
|
||||
then
|
||||
case "$host" in
|
||||
i?86-*) FPM="INTEL" ;;
|
||||
+ x86_64*) FPM="64BIT" ;;
|
||||
arm*-*) FPM="ARM" ;;
|
||||
mips*-*) FPM="MIPS" ;;
|
||||
sparc*-*) FPM="SPARC" ;;
|
@ -1,16 +0,0 @@
|
||||
See https://github.com/pytorch/audio/pull/1297
|
||||
diff -ru sox/src/formats.c sox/src/formats.c
|
||||
--- sox/src/formats.c 2014-10-26 19:55:50.000000000 -0700
|
||||
+++ sox/src/formats.c 2021-02-22 16:01:02.833144070 -0800
|
||||
@@ -333,6 +333,10 @@
|
||||
assert(ft);
|
||||
if (!ft->fp)
|
||||
return sox_false;
|
||||
- fstat(fileno((FILE*)ft->fp), &st);
|
||||
+ int fd = fileno((FILE*)ft->fp);
|
||||
+ if (fd < 0)
|
||||
+ return sox_false;
|
||||
+ if (fstat(fd, &st) < 0)
|
||||
+ return sox_false;
|
||||
return ((st.st_mode & S_IFMT) == S_IFREG);
|
||||
}
|
@ -1,254 +0,0 @@
|
||||
find_package(PkgConfig REQUIRED)
|
||||
|
||||
include(ExternalProject)
|
||||
|
||||
set(INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../install)
|
||||
set(ARCHIVE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../archives)
|
||||
set(patch_dir ${CMAKE_CURRENT_SOURCE_DIR}/../patches)
|
||||
set(COMMON_ARGS --quiet --disable-shared --enable-static --prefix=${INSTALL_DIR} --with-pic --disable-dependency-tracking --disable-debug --disable-examples --disable-doc)
|
||||
|
||||
# To pass custom environment variables to ExternalProject_Add command,
|
||||
# we need to do `${CMAKE_COMMAND} -E env ${envs} <COMMAND>`.
|
||||
# https://stackoverflow.com/a/62437353
|
||||
# We construct the custom environment variables here
|
||||
set(envs
|
||||
"PKG_CONFIG_PATH=${INSTALL_DIR}/lib/pkgconfig"
|
||||
"LDFLAGS=-L${INSTALL_DIR}/lib $ENV{LDFLAGS}"
|
||||
"CFLAGS=-I${INSTALL_DIR}/include -fvisibility=hidden $ENV{CFLAGS}"
|
||||
)
|
||||
|
||||
if (BUILD_MAD)
|
||||
ExternalProject_Add(mad
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://downloads.sourceforge.net/project/mad/libmad/0.15.1b/libmad-0.15.1b.tar.gz
|
||||
URL_HASH SHA256=bbfac3ed6bfbc2823d3775ebb931087371e142bb0e9bb1bee51a76a6e0078690
|
||||
PATCH_COMMAND patch < ${patch_dir}/libmad.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/mad/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/mad/configure ${COMMON_ARGS}
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
endif (BUILD_MAD)
|
||||
|
||||
ExternalProject_Add(amr
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://sourceforge.net/projects/opencore-amr/files/opencore-amr/opencore-amr-0.1.5.tar.gz
|
||||
URL_HASH SHA256=2c006cb9d5f651bfb5e60156dbff6af3c9d35c7bbcc9015308c0aff1e14cd341
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/amr/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/amr/configure ${COMMON_ARGS}
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
ExternalProject_Add(lame
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://downloads.sourceforge.net/project/lame/lame/3.99/lame-3.99.5.tar.gz
|
||||
URL_HASH SHA256=24346b4158e4af3bd9f2e194bb23eb473c75fb7377011523353196b19b9a23ff
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/lame/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/lame/configure ${COMMON_ARGS} --enable-nasm
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
ExternalProject_Add(ogg
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
|
||||
URL_HASH SHA256=c2e8a485110b97550f453226ec644ebac6cb29d1caef2902c007edab4308d985
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/ogg/configure ${COMMON_ARGS}
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
ExternalProject_Add(flac
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ogg
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://ftp.osuosl.org/pub/xiph/releases/flac/flac-1.3.2.tar.xz
|
||||
URL_HASH SHA256=91cfc3ed61dc40f47f050a109b08610667d73477af6ef36dcad31c31a4a8d53f
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/flac/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/flac/configure ${COMMON_ARGS} --with-ogg --disable-cpplibs
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
ExternalProject_Add(vorbis
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ogg
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
|
||||
URL_HASH SHA256=6ed40e0241089a42c48604dc00e362beee00036af2d8b3f46338031c9e0351cb
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/vorbis/configure ${COMMON_ARGS} --with-ogg
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
ExternalProject_Add(opus
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ogg
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.3.1.tar.gz
|
||||
URL_HASH SHA256=65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opus/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opus/configure ${COMMON_ARGS} --with-ogg
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
ExternalProject_Add(opusfile
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS opus
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://ftp.osuosl.org/pub/xiph/releases/opus/opusfile-0.12.tar.gz
|
||||
URL_HASH SHA256=118d8601c12dd6a44f52423e68ca9083cc9f2bfe72da7a8c1acb22a80ae3550b
|
||||
PATCH_COMMAND cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/opusfile/configure ${COMMON_ARGS} --disable-http
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
# OpenMP is by default compiled against GNU OpenMP, which conflicts with the version of OpenMP that PyTorch uses.
|
||||
# See https://github.com/pytorch/audio/pull/1026
|
||||
# TODO: Add flags like https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py
|
||||
set(SOX_OPTIONS
|
||||
--disable-openmp
|
||||
--with-amrnb
|
||||
--with-amrwb
|
||||
--with-flac
|
||||
--with-lame
|
||||
--with-oggvorbis
|
||||
--with-opus
|
||||
--without-alsa
|
||||
--without-ao
|
||||
--without-coreaudio
|
||||
--without-oss
|
||||
--without-id3tag
|
||||
--without-ladspa
|
||||
--without-magic
|
||||
--without-png
|
||||
--without-pulseaudio
|
||||
--without-sndfile
|
||||
--without-sndio
|
||||
--without-sunaudio
|
||||
--without-waveaudio
|
||||
--without-wavpack
|
||||
--without-twolame
|
||||
)
|
||||
|
||||
set(SOX_LIBRARIES
|
||||
${INSTALL_DIR}/lib/libsox.a
|
||||
${INSTALL_DIR}/lib/libopencore-amrnb.a
|
||||
${INSTALL_DIR}/lib/libopencore-amrwb.a
|
||||
${INSTALL_DIR}/lib/libmp3lame.a
|
||||
${INSTALL_DIR}/lib/libFLAC.a
|
||||
${INSTALL_DIR}/lib/libopusfile.a
|
||||
${INSTALL_DIR}/lib/libopus.a
|
||||
${INSTALL_DIR}/lib/libvorbisenc.a
|
||||
${INSTALL_DIR}/lib/libvorbisfile.a
|
||||
${INSTALL_DIR}/lib/libvorbis.a
|
||||
${INSTALL_DIR}/lib/libogg.a
|
||||
)
|
||||
|
||||
set(sox_depends
|
||||
ogg flac vorbis opusfile lame amr
|
||||
)
|
||||
|
||||
if (BUILD_MAD)
|
||||
list(
|
||||
APPEND
|
||||
SOX_OPTIONS
|
||||
--with-mad
|
||||
)
|
||||
list(
|
||||
APPEND
|
||||
SOX_LIBRARIES
|
||||
${INSTALL_DIR}/lib/libmad.a
|
||||
)
|
||||
list(
|
||||
APPEND
|
||||
sox_depends
|
||||
mad
|
||||
)
|
||||
else ()
|
||||
list(
|
||||
APPEND
|
||||
SOX_OPTIONS
|
||||
--without-mad
|
||||
)
|
||||
endif (BUILD_MAD)
|
||||
|
||||
ExternalProject_Add(sox
|
||||
PREFIX ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${sox_depends}
|
||||
DOWNLOAD_DIR ${ARCHIVE_DIR}
|
||||
URL https://downloads.sourceforge.net/project/sox/sox/14.4.2/sox-14.4.2.tar.bz2
|
||||
URL_HASH SHA256=81a6956d4330e75b5827316e44ae381e6f1e8928003c6aa45896da9041ea149c
|
||||
PATCH_COMMAND patch -p1 < ${patch_dir}/sox.patch && cp ${patch_dir}/config.guess ${patch_dir}/config.sub ${CMAKE_CURRENT_BINARY_DIR}/src/sox/
|
||||
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${envs} ${CMAKE_CURRENT_BINARY_DIR}/src/sox/configure ${COMMON_ARGS} ${SOX_OPTIONS}
|
||||
BUILD_BYPRODUCTS ${SOX_LIBRARIES}
|
||||
DOWNLOAD_NO_PROGRESS ON
|
||||
LOG_DOWNLOAD ON
|
||||
LOG_UPDATE ON
|
||||
LOG_CONFIGURE ON
|
||||
LOG_BUILD ON
|
||||
LOG_INSTALL ON
|
||||
LOG_MERGED_STDOUTERR ON
|
||||
LOG_OUTPUT_ON_FAILURE ON
|
||||
)
|
||||
|
||||
add_library(libsox INTERFACE)
|
||||
add_dependencies(libsox sox)
|
||||
target_include_directories(libsox INTERFACE ${INSTALL_DIR}/include)
|
||||
target_link_libraries(libsox INTERFACE ${SOX_LIBRARIES})
|
@ -1,101 +0,0 @@
|
||||
from typing import Dict, List
|
||||
|
||||
from paddlespeech.audio._internal import module_utils as _mod_utils
|
||||
from paddlespeech.audio import _paddleaudio
|
||||
|
||||
@_mod_utils.requires_sox()
def set_seed(seed: int):
    """Seed the pseudo-random number generator used by libsox.

    Args:
        seed (int): Seed value. Must be within the int32 range.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    # Forwarded unchanged to the native extension.
    _paddleaudio.sox_utils_set_seed(seed)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def set_verbosity(verbosity: int):
    """Configure how verbose libsox is.

    Args:
        verbosity (int): Verbosity level passed to libsox.

            * ``1`` failure messages
            * ``2`` warnings
            * ``3`` details of processing
            * ``4``-``6`` increasing levels of debug messages

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    # Forwarded unchanged to the native extension.
    _paddleaudio.sox_utils_set_verbosity(verbosity)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def set_buffer_size(buffer_size: int):
    """Configure the buffer size of the sox effect chain.

    Args:
        buffer_size (int): Size, in bytes, of the buffers used for
            processing audio.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    # Forwarded unchanged to the native extension.
    _paddleaudio.sox_utils_set_buffer_size(buffer_size)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def set_use_threads(use_threads: bool):
    """Toggle the multithreading option of the sox effect chain.

    Args:
        use_threads (bool): When ``True``, enables ``libsox``'s parallel
            effects channels processing. To use multithreading, the
            underlying ``libsox`` has to be compiled with OpenMP support.

    See Also:
        http://sox.sourceforge.net/sox.html
    """
    # Forwarded unchanged to the native extension.
    _paddleaudio.sox_utils_set_use_threads(use_threads)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def list_effects() -> Dict[str, str]:
    """Return the sox effects that are available.

    Returns:
        Dict[str, str]: Mapping from ``effect name`` to ``usage``
    """
    # Whatever the extension returns is converted into a plain dict.
    effects = _paddleaudio.sox_utils_list_effects()
    return dict(effects)
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def list_read_formats() -> List[str]:
    """Return the audio formats that can be read.

    Returns:
        List[str]: List of supported audio formats
    """
    # The extension's result is returned as-is.
    readable = _paddleaudio.sox_utils_list_read_formats()
    return readable
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def list_write_formats() -> List[str]:
    """Return the audio formats that can be written.

    Returns:
        List[str]: List of supported audio formats
    """
    # The extension's result is returned as-is.
    writable = _paddleaudio.sox_utils_list_write_formats()
    return writable
|
||||
|
||||
|
||||
@_mod_utils.requires_sox()
def get_buffer_size() -> int:
    """Return the buffer size currently used by the sox effect chain.

    Returns:
        int: size in bytes of buffers used for processing audio.
    """
    # The extension's result is returned as-is.
    size_in_bytes = _paddleaudio.sox_utils_get_buffer_size()
    return size_in_bytes
|
Loading…
Reference in new issue