parent
cfd32d00de
commit
4df081b954
@ -0,0 +1,21 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from . import compliance
|
||||||
|
from . import datasets
|
||||||
|
from . import features
|
||||||
|
from . import functional
|
||||||
|
from . import io
|
||||||
|
from . import metric
|
||||||
|
from . import sox_effects
|
||||||
|
from . import backends
|
@ -0,0 +1,148 @@
|
|||||||
|
import importlib.util
|
||||||
|
import warnings
|
||||||
|
from functools import wraps
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
|
||||||
|
|
||||||
|
|
||||||
|
def is_module_available(*modules: str) -> bool:
    r"""Return whether every module named in ``modules`` can be located.

    The lookup goes through :func:`importlib.util.find_spec`, so the named
    module itself is not imported. This is generally safer than a try/except
    block around ``import X``: it avoids third party libraries breaking
    assumptions of some of our tests, e.g., setting the multiprocessing start
    method when imported (see librosa/#747, torchvision/#544).

    For a dotted name such as ``"pkg.sub"``, ``find_spec`` has to import the
    parent package ``pkg`` to search it. When that parent is missing
    ``find_spec`` raises instead of returning ``None``; this function reports
    the module as unavailable in that case.

    Args:
        *modules (str): Absolute module names to look up.

    Returns:
        bool: ``True`` if every name resolves to a module spec (vacuously
        ``True`` when no names are given), ``False`` otherwise.
    """
    for name in modules:
        try:
            if importlib.util.find_spec(name) is None:
                return False
        except ImportError:
            # ModuleNotFoundError (an ImportError) is raised when the parent
            # package of a dotted name cannot be imported.
            return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def requires_module(*modules: str):
    """Build a decorator that guards a function behind optional modules.

    Availability is checked once, when ``requires_module(...)`` itself is
    called. If every module is available the decorator hands the function back
    untouched. Otherwise the decorated function is replaced by a stub that
    raises ``RuntimeError`` naming the missing module(s), which is a clearer
    failure than a ``NameError: name 'module' is not defined`` at some random
    place.

    Args:
        *modules (str): Names of the modules the decorated function needs.

    Returns:
        Callable: A decorator taking and returning a function.
    """
    unavailable = [name for name in modules if not is_module_available(name)]

    if unavailable:
        if len(unavailable) == 1:
            req = f"module: {unavailable[0]}"
        else:
            req = f"modules: {unavailable}"

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires {req}")

            return wrapped
    else:
        # Everything is importable, so there is nothing to wrap.
        def decorator(func):
            return func

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def deprecated(direction: str, version: Optional[str]=None):
    """Build a decorator that emits a deprecation warning on every call.

    Args:
        direction (str): Migration steps to be given to users; appended to the
            warning text.
        version (str, optional): The version when the object will be removed.
            When ``None`` the message says "future release".

    Returns:
        Callable: A decorator whose wrapper warns (with ``stacklevel=2``, so
        the warning points at the caller) and then forwards to the function.
    """
    release = "future" if version is None else version

    def decorator(func):
        @wraps(func)
        def wrapped(*args, **kwargs):
            qualified = f"{func.__module__}.{func.__name__}"
            message = (f"{qualified} has been deprecated "
                       f"and will be removed from {release} release. "
                       f"{direction}")
            warnings.warn(message, stacklevel=2)
            return func(*args, **kwargs)

        return wrapped

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def is_kaldi_available():
    """Return whether the ``paddleaudio._paddleaudio`` module can be located.

    Kaldi support is considered available exactly when that module is found.
    """
    extension_module = "paddleaudio._paddleaudio"
    return is_module_available(extension_module)
|
||||||
|
|
||||||
|
|
||||||
|
def requires_kaldi():
    """Build a decorator that guards a function behind kaldi availability.

    The availability check runs once, when ``requires_kaldi()`` is called.

    Returns:
        Callable: A pass-through decorator when kaldi is available; otherwise
        a decorator replacing the function with a stub that raises
        ``RuntimeError`` when invoked.
    """
    if not is_kaldi_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires kaldi")

            return wrapped

        return decorator

    def decorator(func):
        return func

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def _check_soundfile_importable():
    """Return whether ``soundfile`` can be found and actually imported.

    Locating the module is not enough: the import itself is attempted, and any
    exception it raises is turned into a warning plus a ``False`` result.
    """
    if is_module_available("soundfile"):
        try:
            import soundfile  # noqa: F401
        except Exception:
            warnings.warn(
                "Failed to import soundfile. 'soundfile' backend is not available."
            )
            return False
        return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# Computed once at module import; is_soundfile_available() returns this cached
# value instead of retrying the import on every call.
_is_soundfile_importable = _check_soundfile_importable()
|
||||||
|
|
||||||
|
|
||||||
|
def is_soundfile_available():
    """Return the module-level result of the ``soundfile`` import check."""
    available = _is_soundfile_importable
    return available
|
||||||
|
|
||||||
|
|
||||||
|
def requires_soundfile():
    """Build a decorator that guards a function behind ``soundfile``.

    The availability check runs once, when ``requires_soundfile()`` is called.

    Returns:
        Callable: A pass-through decorator when soundfile is available;
        otherwise a decorator replacing the function with a stub that raises
        ``RuntimeError`` when invoked.
    """
    if not is_soundfile_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires soundfile")

            return wrapped

        return decorator

    def decorator(func):
        return func

    return decorator
|
||||||
|
|
||||||
|
|
||||||
|
def is_sox_available():
    """Return whether the ``paddleaudio._paddleaudio`` module can be located.

    Sox support is considered available exactly when that module is found.
    """
    extension_module = "paddleaudio._paddleaudio"
    return is_module_available(extension_module)
|
||||||
|
|
||||||
|
|
||||||
|
def requires_sox():
    """Build a decorator that guards a function behind sox availability.

    The availability check runs once, when ``requires_sox()`` is called.

    Returns:
        Callable: A pass-through decorator when sox is available; otherwise a
        decorator replacing the function with a stub that raises
        ``RuntimeError`` when invoked.
    """
    if not is_sox_available():

        def decorator(func):
            @wraps(func)
            def wrapped(*args, **kwargs):
                raise RuntimeError(
                    f"{func.__module__}.{func.__name__} requires sox")

            return wrapped

        return decorator

    def decorator(func):
        return func

    return decorator
|
@ -0,0 +1,26 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .soundfile_backend import depth_convert
|
||||||
|
from .soundfile_backend import soundfile_load
|
||||||
|
from .soundfile_backend import normalize
|
||||||
|
from .soundfile_backend import resample
|
||||||
|
from .soundfile_backend import soundfile_save
|
||||||
|
from .soundfile_backend import to_mono
|
||||||
|
|
||||||
|
from . import utils
|
||||||
|
from .utils import get_audio_backend
|
||||||
|
from .utils import list_audio_backends
|
||||||
|
from .utils import set_audio_backend
|
||||||
|
|
||||||
|
# Runs as a side effect of importing this package.
# NOTE(review): presumably selects the default audio backend; the helper is
# defined in .utils, outside this view — confirm there.
utils._init_audio_backend()
|
@ -0,0 +1,55 @@
|
|||||||
|
# Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
|
||||||
|
|
||||||
|
class AudioInfo:
    """Return value of the ``info`` function: metadata describing an audio file.

    This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
    :ref:`"soundfile" backend with the new interface<soundfile_backend>`.

    :ivar int sample_rate: Sample rate
    :ivar int num_frames: The number of frames
    :ivar int num_channels: The number of channels
    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
        or when it cannot be accurately inferred.
    :ivar str encoding: Audio encoding
        The values encoding can take are one of the following:

            * ``PCM_S``: Signed integer linear PCM
            * ``PCM_U``: Unsigned integer linear PCM
            * ``PCM_F``: Floating point linear PCM
            * ``FLAC``: Flac, Free Lossless Audio Codec
            * ``ULAW``: Mu-law
            * ``ALAW``: A-law
            * ``MP3`` : MP3, MPEG-1 Audio Layer III
            * ``VORBIS``: OGG Vorbis
            * ``AMR_WB``: Adaptive Multi-Rate Wideband
            * ``AMR_NB``: Adaptive Multi-Rate Narrowband
            * ``OPUS``: Opus
            * ``HTK``: Single channel 16-bit PCM
            * ``UNKNOWN`` : None of above
    """

    def __init__(
            self,
            sample_rate: int,
            num_frames: int,
            num_channels: int,
            bits_per_sample: int,
            encoding: str, ):
        """Store the given metadata fields unchanged as instance attributes."""
        self.sample_rate = sample_rate
        self.num_frames = num_frames
        self.num_channels = num_channels
        self.bits_per_sample = bits_per_sample
        self.encoding = encoding

    def __str__(self):
        """Return a one-line ``ClassName(field=value, ...)`` summary."""
        # Use the real class name; the text previously said "AudioMetaData",
        # a name this class does not have.
        return (f"{type(self).__name__}("
                f"sample_rate={self.sample_rate}, "
                f"num_frames={self.num_frames}, "
                f"num_channels={self.num_channels}, "
                f"bits_per_sample={self.bits_per_sample}, "
                f"encoding={self.encoding}"
                f")")
|
@ -0,0 +1,32 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
|
||||||
|
|
||||||
|
|
||||||
|
def load(
        filepath: Union[str, Path],
        out: Optional[Tensor]=None,
        normalization: Union[bool, float, Callable]=True,
        channels_first: bool=True,
        num_frames: int=0,
        offset: int=0,
        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
    """Placeholder ``load`` for when no audio backend exists.

    Every argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
|
||||||
|
|
||||||
|
|
||||||
|
def save(filepath: str,
         src: Tensor,
         sample_rate: int,
         precision: int=16,
         channels_first: bool=True) -> None:
    """Placeholder ``save`` for when no audio backend exists.

    Every argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
|
||||||
|
|
||||||
|
|
||||||
|
def info(filepath: str) -> None:
    """Placeholder ``info`` for when no audio backend exists.

    The argument is ignored.

    Raises:
        RuntimeError: Always.
    """
    message = "No audio I/O backend is available."
    raise RuntimeError(message)
|
@ -0,0 +1,661 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import resampy
|
||||||
|
import soundfile
|
||||||
|
from scipy.io import wavfile
|
||||||
|
|
||||||
|
from ..utils import depth_convert
|
||||||
|
from ..utils import ParameterError
|
||||||
|
from .common import AudioInfo
|
||||||
|
|
||||||
|
# Names exported by a star-import of this module.
__all__ = [
    'resample',
    'to_mono',
    'normalize',
    'save',
    'soundfile_save',
    'load',
    'soundfile_load',
    'info',
]
# Normalization types quoted in normalize()'s error message.
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Channel-merge strategies accepted by to_mono().
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Filter names accepted by resample().
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small constant normalize() uses to keep its divisors away from zero.
EPS = 1e-8
|
||||||
|
|
||||||
|
|
||||||
|
def resample(y: np.ndarray,
             src_sr: int,
             target_sr: int,
             mode: str='kaiser_fast') -> np.ndarray:
    """Audio resampling.

    Arguments are validated before any warning is emitted or any resampling
    work starts.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        src_sr (int): Source sample rate.
        target_sr (int): Target sample rate.
        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        np.ndarray: `y` resampled to `target_sr`

    Raises:
        ParameterError: If `y` is not an ``np.ndarray`` or `mode` is not one of
            ``RESAMPLE_MODES``.
    """
    if not isinstance(y, np.ndarray):
        # This message was missing its f-prefix, so the literal text
        # "{type(y)}" was reported instead of the offending type.
        raise ParameterError(
            f'Only support numpy np.ndarray, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    if mode == 'kaiser_best':
        # Adjacent literals instead of a backslash continuation, which leaked
        # the next source line's indentation into the message.
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
            'This function is pretty slow, we recommend the mode '
            'kaiser_fast in large scale audio training')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
|
||||||
|
|
||||||
|
|
||||||
|
def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
    """Convert stereo audio to mono.

    Channels are indexed along the first axis (``y[0]``, ``y[1]``). Only the
    first two channels take part in the merge.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        merge_type (str, optional): Merge type to generate mono waveform;
            one of ``MERGE_TYPES``. Defaults to 'average'.

    Returns:
        np.ndarray: `y` with mono channel. A 1D input is returned unchanged,
        and a 2D input holding a single channel yields that channel.

    Raises:
        ParameterError: If `merge_type` is unknown, `y` has more than two
            dimensions, or averaging is requested for an unsupported dtype.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y
    if y.shape[0] == 1:
        # A single channel stored as 2-D: there is no second channel to index,
        # so return the only one instead of failing with IndexError.
        return y[0]

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # 'average': the arithmetic depends on the sample dtype.
    first, second = y[0], y[1]
    if np.issubdtype(y.dtype, np.floating):
        return (first + second) * 0.5
    if np.issubdtype(y.dtype, np.integer) and y.dtype.itemsize < 8:
        # Widen before summing so the addition cannot overflow. The floor
        # average of two in-range samples is itself in range, so casting back
        # needs no clipping.
        total = first.astype('int64') + second.astype('int64')
        return (total // 2).astype(y.dtype)
    raise ParameterError(f'Unsupported dtype: {y.dtype}')
|
||||||
|
|
||||||
|
|
||||||
|
def soundfile_load_(file: os.PathLike,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
    """Read a waveform with the soundfile library (libsndfile).

    Args:
        file (os.PathLike): File of waveform.
        offset (Optional[float], optional): Start position; it is multiplied by
            the file's sample rate to get the frame to seek to. A falsy value
            (``None`` or 0) reads from the beginning. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
        duration (Optional[int], optional): Length to read; it is multiplied by
            the file's sample rate to get a frame count. ``None`` reads to the
            end of the file. Defaults to None.

    Returns:
        Tuple[np.ndarray, int]: The transposed array returned by the read, and
        the file's sample rate.
    """
    with soundfile.SoundFile(file) as handle:
        native_rate = handle.samplerate
        if offset:
            handle.seek(int(offset * native_rate))
        # -1 asks soundfile for every remaining frame.
        n_frames = -1 if duration is None else int(duration * native_rate)
        samples = handle.read(frames=n_frames, dtype=dtype, always_2d=False)
        y = samples.T

    return y, handle.samplerate
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(y: np.ndarray, norm_type: str='linear',
              mul_factor: float=1.0) -> np.ndarray:
    """Normalize an input audio with additional multiplier.

    ``'linear'`` divides by the peak absolute value (plus ``EPS``);
    ``'gaussian'`` subtracts the mean and divides by the standard deviation
    (floored at ``EPS``). Either result is then scaled by `mul_factor`.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        mul_factor (float, optional): Scaling factor. Defaults to 1.0.

    Returns:
        np.ndarray: `y` after normalization.

    Raises:
        NotImplementedError: If `norm_type` is neither 'linear' nor 'gaussian'.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor

    if norm_type == 'gaussian':
        mean = np.mean(y)
        spread = max(np.std(y), EPS)
        return mul_factor * (y - mean) / spread

    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
|
||||||
|
|
||||||
|
|
||||||
|
def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
    """Save audio file to disk.

    The waveform is written with ``scipy.io.wavfile``. Input that is not
    already int16 or int8 is first converted to int16, with a warning.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        sr (int): Sample rate.
        file (os.PathLike): Path of audio file to save; a ``str`` or any
            ``os.PathLike`` such as ``pathlib.Path``. Must end with ``.wav``.

    Raises:
        ParameterError: If the path does not end with ``.wav`` or `sr` is not
            positive.
    """
    # Resolve path-like objects first: pathlib.Path has no ``endswith``, so
    # the extension check used to fail with AttributeError for the very type
    # the signature advertises.
    path = os.fspath(file)
    if not path.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    # NOTE(review): scipy documents 2-D data for wavfile.write as
    # (Nsamples, Nchannels); ``y`` is written exactly as given, so callers
    # holding channel-first arrays presumably need to transpose — confirm.
    wavfile.write(path, sr, y_out)
|
||||||
|
|
||||||
|
def soundfile_load(
        file: os.PathLike,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
    """Load an audio file from disk and post-process the waveform.

    Steps, in order: read the file, optionally merge channels to mono,
    optionally resample, normalize, then convert to the requested depth.

    Args:
        file (os.PathLike): Path of audio file to load.
        sr (Optional[int], optional): Sample rate of loaded waveform. When
            given and different from the file's rate, the waveform is
            resampled. Defaults to None.
        mono (bool, optional): Return waveform with mono channel. Defaults to True.
        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
        normal (bool, optional): Waveform normalization. Defaults to True.
        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.

    Returns:
        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.

    Raises:
        ParameterError: If the loaded waveform contains no samples.
    """
    y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration)

    if y.ndim == 1:
        has_samples = len(y) > 0
    elif y.ndim == 2:
        has_samples = len(y[0]) > 0
    else:
        has_samples = False
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        y = to_mono(y, merge_type)

    needs_resampling = sr is not None and sr != r
    if needs_resampling:
        y = resample(y, r, sr, mode=resample_mode)
        r = sr

    if normal:
        y = normalize(y, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # Integer targets are still linearly normalized before the depth
        # conversion below, even when normalization was not requested.
        y = normalize(y, 'linear', 1.0)

    return depth_convert(y, dtype), r
|
||||||
|
|
||||||
|
# The code below is taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modification.
|
||||||
|
|
||||||
|
def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int):
    """Pick the subtype string for writing a WAV file.

    Args:
        dtype (paddle.dtype): Tensor dtype; consulted only when both
            `encoding` and `bits_per_sample` are unset.
        encoding (str): One of ``"PCM_S"``, ``"PCM_U"``, ``"PCM_F"``,
            ``"ULAW"``, ``"ALAW"``, or a falsy value for "unspecified".
        bits_per_sample (int): Requested bit depth, or a falsy value for
            "unspecified".

    Returns:
        str: The subtype name.

    Raises:
        ValueError: If the combination is not supported for WAV.
    """
    if not encoding:
        if bits_per_sample:
            # 8-bit is written unsigned; other depths map directly.
            if bits_per_sample == 8:
                return "PCM_U8"
            return f"PCM_{bits_per_sample}"
        # Nothing was requested explicitly: derive the subtype from the dtype.
        by_dtype = {
            paddle.uint8: "PCM_U8",
            paddle.int16: "PCM_16",
            paddle.int32: "PCM_32",
            paddle.float32: "FLOAT",
            paddle.float64: "DOUBLE",
        }
        subtype = by_dtype.get(dtype)
        if not subtype:
            raise ValueError(f"Unsupported dtype for wav: {dtype}")
        return subtype

    if encoding == "PCM_S":
        if not bits_per_sample:
            return "PCM_32"
        if bits_per_sample == 8:
            raise ValueError("wav does not support 8-bit signed PCM encoding.")
        return f"PCM_{bits_per_sample}"

    if encoding == "PCM_F":
        if bits_per_sample in (None, 32):
            return "FLOAT"
        if bits_per_sample == 64:
            return "DOUBLE"
        raise ValueError("wav only supports 32/64-bit float PCM encoding.")

    # These encodings exist only at 8 bits: (encoding, subtype, error text).
    eight_bit_only = (
        ("PCM_U", "PCM_U8", "wav only supports 8-bit unsigned PCM encoding."),
        ("ULAW", "ULAW", "wav only supports 8-bit mu-law encoding."),
        ("ALAW", "ALAW", "wav only supports 8-bit a-law encoding."),
    )
    for name, subtype, message in eight_bit_only:
        if encoding == name:
            if bits_per_sample in (None, 8):
                return subtype
            raise ValueError(message)

    raise ValueError(f"wav does not support {encoding}.")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
    """Pick the subtype string for writing a SPHERE (sph) file.

    Args:
        encoding (str): ``None`` or ``"PCM_S"`` for signed PCM, ``"ULAW"`` or
            ``"ALAW"``; anything else is rejected.
        bits_per_sample (int): Requested bit depth, or a falsy value for
            "unspecified".

    Returns:
        str: The subtype name.

    Raises:
        ValueError: If the encoding or bit depth is not supported for sph.
    """
    if encoding is None or encoding == "PCM_S":
        if bits_per_sample:
            return f"PCM_{bits_per_sample}"
        return "PCM_32"

    if encoding == "PCM_U" or encoding == "PCM_F":
        raise ValueError(f"sph does not support {encoding} encoding.")

    if encoding == "ULAW":
        if bits_per_sample not in (None, 8):
            raise ValueError("sph only supports 8-bit for mu-law encoding.")
        return "ULAW"

    if encoding == "ALAW":
        # The bit depth is not checked for a-law.
        return "ALAW"

    raise ValueError(f"sph does not support {encoding}.")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int):
    """Pick the subtype string for the given container format.

    Args:
        dtype (paddle.dtype): Tensor dtype; forwarded for ``"wav"`` only.
        format (str): ``"wav"``, ``"flac"``, ``"ogg"``, ``"vorbis"``,
            ``"sph"``, ``"nis"`` or ``"nist"``.
        encoding (str): Requested encoding, or a falsy value.
        bits_per_sample (int): Requested bit depth, or a falsy value.

    Returns:
        str: The subtype name.

    Raises:
        ValueError: If the format is unknown or the encoding / bit depth is
            not supported by it.
    """
    if format == "wav":
        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)

    if format == "sph":
        return _get_subtype_for_sphere(encoding, bits_per_sample)

    if format == "flac":
        if encoding:
            raise ValueError("flac does not support encoding.")
        if not bits_per_sample:
            return "PCM_16"
        if bits_per_sample > 24:
            raise ValueError("flac does not support bits_per_sample > 24.")
        if bits_per_sample == 8:
            return "PCM_S8"
        return f"PCM_{bits_per_sample}"

    if format in ("ogg", "vorbis"):
        if encoding or bits_per_sample:
            raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
        return "VORBIS"

    if format in ("nis", "nist"):
        return "PCM_16"

    raise ValueError(f"Unsupported format: {format}")
|
||||||
|
|
||||||
|
def save(
    filepath: str,
    src: paddle.Tensor,
    sample_rate: int,
    channels_first: bool = True,
    compression: Optional[float] = None,
    format: Optional[str] = None,
    encoding: Optional[str] = None,
    bits_per_sample: Optional[int] = None,
):
    """Save audio data to file.

    Note:
        The formats this function can handle depend on the soundfile installation.
        This function is tested on the following formats;

        * WAV

            * 32-bit floating-point
            * 32-bit signed integer
            * 16-bit signed integer
            * 8-bit unsigned integer

        * FLAC
        * OGG/VORBIS
        * SPHERE

    Note:
        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
        ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.

    Args:
        filepath (str or pathlib.Path): Path to audio file. An object with a
            ``write`` attribute is treated as a file object.
        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
        sample_rate (int): Sampling rate.
        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
            otherwise `[time, channel]`.
        compression (float or None, optional): Not used.
            It is here only for interface compatibility reasons with the "sox_io" backend.
        format (str or None, optional): Override the audio format.
            When ``filepath`` argument is path-like object, audio format is
            inferred from file extension. If the file extension is missing or
            different, you can specify the correct format with this argument.

            When ``filepath`` argument is file-like object,
            this argument is required.

            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
            ``"flac"`` and ``"sph"``.
        encoding (str or None, optional): Changes the encoding for supported formats.
            This argument is effective only for supported formats, such as
            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;

                - ``"PCM_S"`` (signed integer Linear PCM)
                - ``"PCM_U"`` (unsigned integer Linear PCM)
                - ``"PCM_F"`` (floating point PCM)
                - ``"ULAW"`` (mu-law)
                - ``"ALAW"`` (a-law)

        bits_per_sample (int or None, optional): Changes the bit depth for the
            supported formats.
            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
            you can change the bit depth.
            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.

    Supported formats/encodings/bit depth/compression are:

    ``"wav"``
        - 32-bit floating-point PCM
        - 32-bit signed integer PCM
        - 24-bit signed integer PCM
        - 16-bit signed integer PCM
        - 8-bit unsigned integer PCM
        - 8-bit mu-law
        - 8-bit a-law

        Note:
            Default encoding/bit depth is determined by the dtype of
            the input Tensor.

    ``"flac"``
        - 8-bit
        - 16-bit (default)
        - 24-bit

    ``"ogg"``, ``"vorbis"``
        - Doesn't accept changing configuration.

    ``"sph"``
        - 8-bit signed integer PCM
        - 16-bit signed integer PCM
        - 24-bit signed integer PCM
        - 32-bit signed integer PCM (default)
        - 8-bit mu-law
        - 8-bit a-law
        - 16-bit a-law
        - 24-bit a-law
        - 32-bit a-law

    Raises:
        ValueError: If ``src`` is not 2D, ``bits_per_sample`` is invalid, or
            the format/encoding/bit-depth combination is unsupported.
        RuntimeError: If ``filepath`` is a file object and ``format`` is not given.
    """
    if src.ndim != 2:
        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")

    if compression is not None:
        warnings.warn(
            '`save` function of "soundfile" backend does not support "compression" parameter. '
            "The argument is silently ignored."
        )

    # A file object carries no extension, so the format must be explicit.
    writes_to_file_object = hasattr(filepath, "write")
    if writes_to_file_object:
        if format is None:
            raise RuntimeError("`format` is required when saving to file object.")
        ext = format.lower()
    else:
        ext = str(filepath).rsplit(".", 1)[-1].lower()

    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
        raise ValueError("Invalid bits_per_sample.")
    if bits_per_sample == 24:
        warnings.warn(
            "Saving audio with 24 bits per sample might warp samples near -1. "
            "Using 16 bits per sample might be able to avoid this."
        )
    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)

    # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
    # so we extend the extensions manually here
    file_format = format
    if file_format is None and ext in ["nis", "nist", "sph"]:
        file_format = "NIST"

    # The data is handed to soundfile time-first, so channel-first input is
    # transposed.
    data = src.t() if channels_first else src

    soundfile.write(file=filepath, data=data, samplerate=sample_rate, subtype=subtype, format=file_format)
|
||||||
|
|
||||||
|
# Maps subtype names to dtype names.
# NOTE(review): presumably consulted by load() to choose the read dtype for
# integer/float PCM files; the usage is outside this view — confirm.
_SUBTYPE2DTYPE = {
    "PCM_S8": "int8",
    "PCM_U8": "uint8",
    "PCM_16": "int16",
    "PCM_32": "int32",
    "FLOAT": "float32",
    "DOUBLE": "float64",
}
|
||||||
|
|
||||||
|
def load(
        filepath: str,
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
    """Read audio from a file with the soundfile backend.

    Note:
        Which container formats can be decoded depends on the installed
        soundfile library. This function is tested on:

        * WAV (32-bit floating-point, 32-bit signed integer,
          16-bit signed integer, 8-bit unsigned integer)
        * FLAC
        * OGG/VORBIS
        * SPHERE

        With the defaults (``normalize=True``, ``channels_first=True``) the
        result is a ``float32`` Tensor of shape `[channel, time]` whose samples
        lie in ``[-1.0, 1.0]``.

        For integer WAV input (32-bit signed, 16-bit signed, 8-bit unsigned;
        24-bit signed integer is not supported), ``normalize=False`` returns an
        integer Tensor covering the full range of the matching dtype
        (``int32``, ``int16`` or ``uint8``).

        ``normalize`` has no effect on 32-bit floating-point WAV or on other
        formats such as ``flac`` and ``mp3``; those always come back as
        ``float32`` normalized to ``[-1.0, 1.0]``.

    Note:
        ``filepath`` is annotated as ``str`` only, for consistency with the
        ``"sox_io"`` backend, although ``pathlib.Path`` objects are accepted too.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        frame_offset (int, optional):
            Number of frames to skip before reading starts.
        num_frames (int, optional):
            Maximum number of frames to read. ``-1`` reads everything that
            remains after ``frame_offset``. Fewer frames may be returned when
            the file does not contain enough of them.
        normalize (bool, optional):
            When ``True`` the result is always ``float32`` normalized to
            ``[-1.0, 1.0]``. When ``False`` and the input is an integer WAV, the
            result keeps the integer type. Ignored for every other format.
        channels_first (bool, optional):
            When ``True`` the returned Tensor is `[channel, time]`, otherwise
            `[time, channel]`.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        (paddle.Tensor, int): The waveform and its sample rate. The Tensor is
        integer-typed for un-normalized integer WAV input and ``float32``
        otherwise; its layout follows ``channels_first``.

    Raises:
        ValueError: If an un-normalized WAV read is requested for a subtype
            that has no entry in ``_SUBTYPE2DTYPE``.
    """
    with soundfile.SoundFile(filepath, "r") as audio_file:
        # Only un-normalized WAV input keeps its native sample dtype.
        keep_native_dtype = audio_file.format == "WAV" and not normalize
        if not keep_native_dtype:
            read_dtype = "float32"
        elif audio_file.subtype in _SUBTYPE2DTYPE:
            read_dtype = _SUBTYPE2DTYPE[audio_file.subtype]
        else:
            raise ValueError(f"Unsupported subtype: {audio_file.subtype}")

        # Seeks to ``frame_offset`` and resolves how many frames to read.
        frame_count = audio_file._prepare_read(frame_offset, None, num_frames)
        samples = audio_file.read(frame_count, read_dtype, always_2d=True)
        rate = audio_file.samplerate

    tensor = paddle.to_tensor(samples)
    if not channels_first:
        return tensor, rate
    # ``always_2d`` reads yield [time, channel]; swap to [channel, time].
    return paddle.transpose(tensor, perm=[1, 0]), rate
|
||||||
|
|
||||||
|
|
||||||
|
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristic: the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired by
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
# It is consulted by ``_get_bit_depth``, which falls back to 0 for unknown subtypes.
_SUBTYPE_TO_BITS_PER_SAMPLE = {
    "PCM_S8": 8,  # Signed 8 bit data
    "PCM_16": 16,  # Signed 16 bit data
    "PCM_24": 24,  # Signed 24 bit data
    "PCM_32": 32,  # Signed 32 bit data
    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
    "FLOAT": 32,  # 32 bit float data
    "DOUBLE": 64,  # 64 bit float data
    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
    "IMA_ADPCM": 0,  # IMA ADPCM.
    "MS_ADPCM": 0,  # Microsoft ADPCM.
    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
}
|
||||||
|
|
||||||
|
def _get_bit_depth(subtype):
    """Return the bits-per-sample value registered for a soundfile subtype.

    Args:
        subtype: Subtype name as reported by soundfile.

    Returns:
        int: The entry from ``_SUBTYPE_TO_BITS_PER_SAMPLE``, or ``0`` (after
        emitting a warning) when the subtype is not in that table.
    """
    known_bits = _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype)
    if known_bits is not None:
        return known_bits

    # No table entry: tell the user and report an unknown bit depth.
    warnings.warn(
        f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
        "attribute will be set to 0. If you are seeing this warning, please "
        "report by opening an issue on github (after checking for existing/closed ones). "
        "You may otherwise ignore this warning."
    )
    return 0
|
||||||
|
|
||||||
|
# Maps a soundfile subtype name to the encoding label reported in ``AudioInfo``.
# ``_get_encoding`` returns "UNKNOWN" for subtypes that are not listed here.
_SUBTYPE_TO_ENCODING = {
    "PCM_S8": "PCM_S",
    "PCM_16": "PCM_S",
    "PCM_24": "PCM_S",
    "PCM_32": "PCM_S",
    "PCM_U8": "PCM_U",
    "FLOAT": "PCM_F",
    "DOUBLE": "PCM_F",
    "ULAW": "ULAW",
    "ALAW": "ALAW",
    "VORBIS": "VORBIS",
}
|
||||||
|
|
||||||
|
def _get_encoding(format: str, subtype: str):
|
||||||
|
if format == "FLAC":
|
||||||
|
return "FLAC"
|
||||||
|
return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
|
||||||
|
|
||||||
|
def info(filepath: str, format: Optional[str] = None) -> AudioInfo:
    """Collect signal metadata of an audio file via soundfile.

    Note:
        ``filepath`` is annotated as ``str`` only, for consistency with the
        ``"sox_io"`` backend, although ``pathlib.Path`` objects are accepted too.

    Args:
        filepath (path-like object or file-like object):
            Source of audio data.
        format (str or None, optional):
            Not used. PySoundFile does not accept format hint.

    Returns:
        AudioInfo: Sample rate, frame count and channel count taken from
        ``soundfile.info``, plus the bit depth and encoding label derived from
        the reported format and subtype.
    """
    meta = soundfile.info(filepath)
    bit_depth = _get_bit_depth(meta.subtype)
    encoding_label = _get_encoding(meta.format, meta.subtype)
    return AudioInfo(
        meta.samplerate,
        meta.frames,
        meta.channels,
        bits_per_sample=bit_depth,
        encoding=encoding_label,
    )
|
@ -0,0 +1,101 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable
|
||||||
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddleaudio
|
||||||
|
from paddle import Tensor
|
||||||
|
from .common import AudioInfo
|
||||||
|
import os
|
||||||
|
|
||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
|
||||||
|
#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
|
||||||
|
|
||||||
|
def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
|
||||||
|
raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
|
||||||
|
|
||||||
|
|
||||||
|
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
|
||||||
|
raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
|
||||||
|
|
||||||
|
|
||||||
|
# Note: need to comply TorchScript syntax -- need annotation and no f-string
|
||||||
|
def _fail_load(
|
||||||
|
filepath: str,
|
||||||
|
frame_offset: int = 0,
|
||||||
|
num_frames: int = -1,
|
||||||
|
normalize: bool = True,
|
||||||
|
channels_first: bool = True,
|
||||||
|
format: Optional[str] = None,
|
||||||
|
) -> Tuple[Tensor, int]:
|
||||||
|
raise RuntimeError("Failed to load audio from {}".format(filepath))
|
||||||
|
|
||||||
|
|
||||||
|
def _fail_load_fileobj(fileobj, *args, **kwargs):
|
||||||
|
raise RuntimeError(f"Failed to load audio from {fileobj}")
|
||||||
|
|
||||||
|
# Hooks called by ``load`` / ``info`` when the native sox binding returns ``None``.
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
# ``load`` looks this hook up as ``_fallback_load_fileobj``. It used to be bound
# only under a misspelled name, so the file-object fallback path raised
# NameError instead of the intended RuntimeError.
_fallback_load_fileobj = _fail_load_fileobj
# Misspelled alias retained so any existing reference to it keeps working.
_fallback_load_filebj = _fallback_load_fileobj
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def load(
        filepath: str,
        frame_offset: int = 0,
        num_frames: int = -1,
        normalize: bool = True,
        channels_first: bool = True,
        format: Optional[str] = None,
) -> Tuple[Tensor, int]:
    """Load audio through the native sox binding.

    An object with a ``read`` attribute is treated as a file-like object and
    handed to ``load_audio_fileobj``; anything else is converted with
    ``os.fspath`` and handed to ``sox_io_load_audio_file``. When the binding
    returns ``None``, the matching module-level fallback hook is called with
    the same arguments.

    Args:
        filepath: Path-like object or file-like object to read from.
        frame_offset (int, optional): Forwarded to the binding unchanged.
        num_frames (int, optional): Forwarded to the binding unchanged.
        normalize (bool, optional): Forwarded to the binding unchanged.
        channels_first (bool, optional): Forwarded to the binding unchanged.
        format (str or None, optional): Forwarded to the binding unchanged.

    Returns:
        Tuple[Tensor, int]: The first element of the binding's result converted
        with ``paddle.to_tensor``, and its second element (annotated as ``int``;
        presumably the sample rate).
    """
    read_args = (frame_offset, num_frames, normalize, channels_first, format)

    if hasattr(filepath, "read"):
        result = paddleaudio._paddleaudio.load_audio_fileobj(filepath, *read_args)
        if result is None:
            return _fallback_load_fileobj(filepath, *read_args)
        return (paddle.to_tensor(result[0]), result[1])

    filepath = os.fspath(filepath)
    result = paddleaudio._paddleaudio.sox_io_load_audio_file(filepath, *read_args)
    if result is None:
        return _fallback_load(filepath, *read_args)
    return (paddle.to_tensor(result[0]), result[1])
|
||||||
|
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def save(
        filepath: str,
        src: Tensor,
        sample_rate: int,
        channels_first: bool = True,
        compression: Optional[float] = None,
        format: Optional[str] = None,
        encoding: Optional[str] = None,
        bits_per_sample: Optional[int] = None,
):
    """Write a waveform through the native sox binding.

    ``src`` is converted with ``.numpy()`` first. An object with a ``write``
    attribute is treated as a file-like object and handed to
    ``save_audio_fileobj``; anything else is converted with ``os.fspath`` and
    handed to ``sox_io_save_audio_file``.

    Args:
        filepath: Path-like object or file-like object to write to.
        src (Tensor): Audio data to save.
        sample_rate (int): Forwarded to the binding unchanged.
        channels_first (bool, optional): Forwarded to the binding unchanged.
        compression (float or None, optional): Forwarded to the binding unchanged.
        format (str or None, optional): Forwarded to the binding unchanged.
        encoding (str or None, optional): Forwarded to the binding unchanged.
        bits_per_sample (int or None, optional): Forwarded to the binding unchanged.
    """
    samples = src.numpy()
    write_args = (samples, sample_rate, channels_first, compression, format,
                  encoding, bits_per_sample)

    if not hasattr(filepath, "write"):
        filepath = os.fspath(filepath)
        paddleaudio._paddleaudio.sox_io_save_audio_file(filepath, *write_args)
        return

    paddleaudio._paddleaudio.save_audio_fileobj(filepath, *write_args)
|
||||||
|
|
||||||
|
@_mod_utils.requires_sox()
def info(filepath: str, format: Optional[str] = None, ) -> AudioInfo:
    """Fetch audio metadata through the native sox binding.

    An object with a ``read`` attribute is treated as a file-like object and
    handed to ``get_info_fileobj``; anything else is converted with
    ``os.fspath`` and handed to ``get_info_file``. When the binding returns
    ``None``, the matching module-level fallback hook is called instead.

    Args:
        filepath: Path-like object or file-like object to inspect.
        format (str or None, optional): Forwarded to the binding unchanged.

    Returns:
        AudioInfo: Built by unpacking the sequence returned by the binding.
    """
    if hasattr(filepath, "read"):
        fields = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
        if fields is None:
            return _fallback_info_fileobj(filepath, format)
        return AudioInfo(*fields)

    filepath = os.fspath(filepath)
    fields = paddleaudio._paddleaudio.get_info_file(filepath, format)
    if fields is None:
        return _fallback_info(filepath, format)
    return AudioInfo(*fields)
|
@ -0,0 +1,81 @@
|
|||||||
|
"""Defines utilities for switching audio backends"""
|
||||||
|
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
|
||||||
|
|
||||||
|
import warnings
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import paddleaudio
|
||||||
|
from paddleaudio._internal import module_utils as _mod_utils
|
||||||
|
|
||||||
|
from . import no_backend, soundfile_backend, sox_io_backend
|
||||||
|
|
||||||
|
# Names exported by ``from <this module> import *``.
__all__ = [
    "list_audio_backends",
    "get_audio_backend",
    "set_audio_backend",
]
|
||||||
|
|
||||||
|
|
||||||
|
def list_audio_backends() -> List[str]:
    """Report which audio I/O backends can be used right now.

    Returns:
        List[str]: ``"soundfile"`` if the ``soundfile`` module is available,
        followed by ``"sox_io"`` if sox is available. Empty when neither is.
    """
    # (backend name, availability probe) in the order they are reported.
    probes = (
        ("soundfile", lambda: _mod_utils.is_module_available("soundfile")),
        ("sox_io", lambda: _mod_utils.is_sox_available()),
    )
    return [name for name, is_available in probes if is_available()]
|
||||||
|
|
||||||
|
|
||||||
|
def set_audio_backend(backend: Optional[str]):
    """Select the backend used for audio I/O.

    The chosen backend module's ``save``, ``load`` and ``info`` functions are
    installed as attributes of the ``paddleaudio`` package.

    Args:
        backend (str or None): Name of the backend.
            One of ``"sox_io"`` or ``"soundfile"`` based on availability
            of the system. If ``None`` is provided the current backend is unassigned.

    Raises:
        RuntimeError: If ``backend`` is not ``None`` and is not currently available.
        NotImplementedError: If ``backend`` is available but has no module mapped to it.
    """
    if backend is not None:
        available = list_audio_backends()
        if backend not in available:
            raise RuntimeError(f'Backend "{backend}" is not one of '
                               f"available backends: {available}.")

    modules_by_name = {
        None: no_backend,
        "sox_io": sox_io_backend,
        "soundfile": soundfile_backend,
    }
    if backend not in modules_by_name:
        raise NotImplementedError(f'Unexpected backend "{backend}"')
    module = modules_by_name[backend]

    # Re-point the package-level entry points at the selected backend.
    for func_name in ("save", "load", "info"):
        setattr(paddleaudio, func_name, getattr(module, func_name))
|
||||||
|
|
||||||
|
def _init_audio_backend():
    """Pick the initial backend: ``soundfile`` first, then ``sox_io``.

    When neither is available a warning is issued and the backend is
    unassigned via ``set_audio_backend(None)``.
    """
    available = list_audio_backends()
    for preferred in ("soundfile", "sox_io"):
        if preferred in available:
            set_audio_backend(preferred)
            return

    warnings.warn("No audio backend is available.")
    set_audio_backend(None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_audio_backend() -> Optional[str]:
    """Report which backend is currently installed.

    The answer is derived by comparing ``paddleaudio.load`` with the ``load``
    function of each known backend module.

    Returns:
        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.

    Raises:
        ValueError: If ``paddleaudio.load`` matches none of the known backends.
    """
    active_load = paddleaudio.load
    known_backends = (
        (no_backend, None),
        (sox_io_backend, "sox_io"),
        (soundfile_backend, "soundfile"),
    )
    for module, name in known_backends:
        if active_load == module.load:
            return name
    raise ValueError("Unknown backend.")
|
@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from . import kaldi
|
||||||
|
from . import librosa
|
@ -0,0 +1,638 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from torchaudio(https://github.com/pytorch/audio)
|
||||||
|
import math
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
from ..functional import create_dct
|
||||||
|
from ..functional.window import get_window
|
||||||
|
|
||||||
|
# Names exported by ``from <this module> import *``.
__all__ = [
    'spectrogram',
    'fbank',
    'mfcc',
]
|
||||||
|
|
||||||
|
# window types
# ``_feature_window_function`` compares its ``window_type`` argument against
# these values; any other string makes it raise.
HANNING = 'hann'
HAMMING = 'hamming'
POVEY = 'povey'
RECTANGULAR = 'rect'
BLACKMAN = 'blackman'
|
||||||
|
|
||||||
|
|
||||||
|
def _get_epsilon(dtype):
    """Return the constant ``1e-07`` as a tensor of the given dtype.

    Callers in this module use it as a lower bound before taking logarithms.
    """
    eps_value = 1e-07
    return paddle.to_tensor(eps_value, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _next_power_of_2(x: int) -> int:
|
||||||
|
return 1 if x == 0 else 2**(x - 1).bit_length()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    """Cut a 1-D waveform into overlapping frames.

    Args:
        waveform (Tensor): 1-D tensor of samples.
        window_size (int): Number of samples per frame.
        window_shift (int): Number of samples between the starts of adjacent frames.
        snip_edges (bool): When True, only frames lying entirely inside the
            waveform are produced. When False, the waveform is extended with
            flipped copies of itself before framing.

    Returns:
        Tensor: The first ``m`` frames, transposed so that frames are rows. An
        empty `(0, 0)` tensor when ``snip_edges`` is True and the waveform is
        shorter than one window.
    """
    assert waveform.dim() == 1
    total_samples = waveform.shape[0]

    if snip_edges:
        if total_samples < window_size:
            return paddle.empty((0, 0), dtype=waveform.dtype)
        frame_count = 1 + (total_samples - window_size) // window_shift
    else:
        mirrored = paddle.flip(waveform, [0])
        frame_count = (total_samples + (window_shift // 2)) // window_shift
        left_pad = window_size // 2 - window_shift // 2
        if left_pad > 0:
            # Prepend the flipped leading samples and append the whole flipped signal.
            pieces = (mirrored[-left_pad:], waveform, mirrored)
        else:
            # Non-positive pad: slice the waveform from index ``-left_pad`` instead.
            pieces = (waveform[-left_pad:], mirrored)
        waveform = paddle.concat(pieces, axis=0)

    framed = paddle.signal.frame(waveform, window_size, window_shift)
    return framed[:, :frame_count].T
|
||||||
|
|
||||||
|
|
||||||
|
def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    """Create the analysis window applied to every frame.

    Args:
        window_type (str): One of ``HANNING``, ``HAMMING``, ``POVEY``,
            ``RECTANGULAR`` or ``BLACKMAN``.
        window_size (int): Number of samples in the window.
        blackman_coeff (float): Coefficient used only by the Blackman window.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor: The window values, one per sample.

    Raises:
        Exception: If ``window_type`` is not one of the supported names.
    """
    if window_type == HANNING:
        return get_window('hann', window_size, fftbins=False, dtype=dtype)

    if window_type == HAMMING:
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)

    if window_type == POVEY:
        # A Hann window raised to the power 0.85.
        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
        return hann.pow(0.85)

    if window_type == RECTANGULAR:
        return paddle.ones([window_size], dtype=dtype)

    if window_type == BLACKMAN:
        angle_step = 2 * math.pi / (window_size - 1)
        sample_idx = paddle.arange(window_size, dtype=dtype)
        first_cos = 0.5 * paddle.cos(angle_step * sample_idx)
        second_cos = (0.5 - blackman_coeff) * paddle.cos(
            2 * angle_step * sample_idx)
        return (blackman_coeff - first_cos + second_cos).astype(dtype)

    raise Exception('Invalid window type ' + window_type)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    """Compute the log energy of each frame.

    Args:
        strided_input (Tensor): Frames, one per row.
        epsilon (Tensor): Lower bound applied to the energy before the log.
        energy_floor (float): When non-zero, the result is additionally floored
            at ``log(energy_floor)``.

    Returns:
        Tensor: One log-energy value per frame.
    """
    frame_energy = strided_input.pow(2).sum(1)
    log_energy = paddle.maximum(frame_energy, epsilon).log()
    if energy_floor != 0.0:
        log_floor = paddle.to_tensor(
            math.log(energy_floor), dtype=strided_input.dtype)
        log_energy = paddle.maximum(log_energy, log_floor)
    return log_energy
|
||||||
|
|
||||||
|
|
||||||
|
def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    """Select one channel and derive the framing sizes in samples.

    Args:
        waveform (Tensor): Tensor indexed as `(channel, sample)`.
        channel (int): Channel to extract; negative values are clamped to 0.
        sr (int): Sample rate of the waveform.
        frame_shift (float): Shift between frames, in milliseconds.
        frame_length (float): Frame length, in milliseconds.
        round_to_power_of_two (bool): Whether the padded window size is rounded
            up to a power of two.
        preemphasis_coefficient (float): Only validated here; must be in [0, 1].

    Returns:
        Tuple[Tensor, int, int, int]: The selected channel, ``window_shift``,
        ``window_size`` and ``padded_window_size`` (the last three in samples).

    Raises:
        AssertionError: If any of the argument checks fails.
    """
    channel = max(channel, 0)
    channel_count = waveform.shape[0]
    assert channel < channel_count, (
        'Invalid channel {} for size {}'.format(channel, channel_count))
    waveform = waveform[channel, :]  # size (n)

    # frame_shift and frame_length are given in milliseconds
    window_shift = int(sr * frame_shift * 0.001)
    window_size = int(sr * frame_length * 0.001)
    if round_to_power_of_two:
        padded_window_size = _next_power_of_2(window_size)
    else:
        padded_window_size = window_size

    sample_count = len(waveform)
    assert 2 <= window_size <= sample_count, (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         sample_count))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, (
        'the padded `window_size` must be divisible by two.'
        ' use `round_to_power_of_two` or change `frame_length`')
    assert 0. <= preemphasis_coefficient <= 1.0, (
        '`preemphasis_coefficient` must be between [0,1]')
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
|
||||||
|
|
||||||
|
|
||||||
|
def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    """Frame a waveform and apply the per-frame preprocessing steps.

    The steps run in this order: framing, optional dithering, optional DC
    removal, optional pre-emphasis, windowing, and zero-padding of each frame
    up to ``padded_window_size``.

    Args:
        waveform (Tensor): Single-channel waveform passed to ``_get_strided``.
        padded_window_size (int): Frame length after zero-padding.
        window_size (int): Frame length in samples before padding.
        window_shift (int): Hop between adjacent frames, in samples.
        window_type (str): Window name understood by ``_feature_window_function``.
        blackman_coeff (float): Coefficient for the Blackman window.
        snip_edges (bool): Edge handling mode forwarded to ``_get_strided``.
        raw_energy (bool): If True, the log energy is taken right after DC
            removal; otherwise after windowing and padding.
        energy_floor (float): Floor forwarded to ``_get_log_energy``.
        dither (float): Scale of the random noise added to frames; 0 disables it.
        remove_dc_offset (bool): Whether to subtract each frame's mean.
        preemphasis_coefficient (float): Pre-emphasis coefficient; 0 disables it.

    Returns:
        Tuple[Tensor, Tensor]: The processed frames and the per-frame log energy.
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    # (m, window_size)
    strided_input = _get_strided(waveform, window_size, window_shift,
                                 snip_edges)

    if dither != 0.0:
        # Clamp the uniform samples away from zero so the log below stays finite.
        x = paddle.maximum(epsilon,
                           paddle.rand(strided_input.shape, dtype=dtype))
        # NOTE(review): both factors use the same uniform sample ``x``; the
        # textbook Box-Muller transform draws two independent samples. Confirm
        # this is intended.
        rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x)
        strided_input = strided_input + rand_gauss * dither

    if remove_dc_offset:
        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
        strided_input = strided_input - row_means

    if raw_energy:
        # Energy measured before pre-emphasis and windowing.
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
        # Replicate-pad one sample on the left so that column j of the padded
        # tensor holds the previous sample of column j in the original frames.
        offset_strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0),
            data_format='NCL',
            mode='replicate').squeeze(0)  # (m, window_size + 1)
        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]

    window_function = _feature_window_function(
        window_type, window_size, blackman_coeff,
        dtype).unsqueeze(0)  # (1, window_size)
    strided_input = strided_input * window_function  # (m, window_size)

    # (m, padded_window_size)
    if padded_window_size != window_size:
        # Zero-pad every frame on the right up to the padded length.
        padding_right = padded_window_size - window_size
        strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (0, padding_right),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)

    if not raw_energy:
        # Energy measured on the fully processed frames.
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # size (m)

    return strided_input, signal_log_energy
|
||||||
|
|
||||||
|
|
||||||
|
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    """Optionally subtract the mean over axis 0 from every row.

    Args:
        tensor (Tensor): Feature matrix.
        subtract_mean (bool): When False the input is returned untouched.

    Returns:
        Tensor: ``tensor`` itself, or ``tensor`` minus its mean over axis 0.
    """
    if not subtract_mean:
        return tensor
    column_means = paddle.mean(tensor, axis=0).unsqueeze(0)
    return tensor - column_means
|
||||||
|
|
||||||
|
|
||||||
|
def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str=POVEY) -> Tensor:
    """Compute a log power spectrogram from a waveform, matching Kaldi's output.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of the waveform to use. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the energy of the output spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient for the input waveform.
            Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute the energy before pre-emphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round the window size up to a power of two by
            zero-padding the input to the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): When True, drop samples at the end of the waveform that cannot fill
            a single frame. Otherwise perform reflect padding at the end of the waveform.
            Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the mean of feature files. Defaults to False.
        window_type (str, optional): Type of window used for the FFT computation. Defaults to POVEY.

    Returns:
        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)`, where the number of
        frames m depends on frame_length and frame_shift.
    """
    eps = _get_epsilon(waveform.dtype)

    properties = _get_waveform_and_window_properties(
        waveform, channel, sr, frame_shift, frame_length,
        round_to_power_of_two, preemphasis_coefficient)
    channel_wave, window_shift, window_size, padded_window_size = properties

    frames, frame_log_energy = _get_window(
        channel_wave, padded_window_size, window_size, window_shift,
        window_type, blackman_coeff, snip_edges, raw_energy, energy_floor,
        dither, remove_dc_offset, preemphasis_coefficient)

    # (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames)
    log_power = paddle.maximum(spectrum.abs().pow(2.), eps).log()
    # The first column is overwritten with the frame log energy.
    log_power[:, 0] = frame_log_energy

    return _subtract_column_mean(log_power, subtract_mean)
|
||||||
|
|
||||||
|
|
||||||
|
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
|
||||||
|
return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
|
||||||
|
return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _mel_scale_scalar(freq: float) -> float:
|
||||||
|
return 1127.0 * math.log(1.0 + freq / 700.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _mel_scale(freq: Tensor) -> Tensor:
|
||||||
|
return 1127.0 * (1.0 + freq / 700.0).log()
|
||||||
|
|
||||||
|
|
||||||
|
def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    """Apply a piecewise-linear VTLN warp to frequencies.

    Frequencies outside ``[low_freq, high_freq]`` are left unchanged. Inside
    that range three linear segments are used, split at the points ``l`` and
    ``h`` computed below: between ``l`` and ``h`` the frequency is scaled by
    ``1 / vtln_warp_factor``; the outer segments use slopes chosen so that
    ``low_freq`` and ``high_freq`` map to themselves.

    Args:
        vtln_low_cutoff (float): Lower cutoff of the warp; must exceed ``low_freq``.
        vtln_high_cutoff (float): Upper cutoff of the warp; must be below ``high_freq``.
        low_freq (float): Lower frequency bound.
        high_freq (float): Upper frequency bound.
        vtln_warp_factor (float): Warp factor.
        freq (Tensor): Frequencies to warp.

    Returns:
        Tensor: Warped frequencies, same shape as ``freq``.
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
    # Breakpoints of the piecewise-linear map on the input axis.
    l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    scale = 1.0 / vtln_warp_factor
    # Values the middle segment takes at the two breakpoints.
    Fl = scale * l
    Fh = scale * h
    assert l > low_freq and h < high_freq
    # Slopes of the outer segments.
    scale_left = (Fl - low_freq) / (l - low_freq)
    scale_right = (high_freq - Fh) / (high_freq - h)
    res = paddle.empty_like(freq)

    outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
    before_l = paddle.less_than(freq, paddle.to_tensor(l))
    before_h = paddle.less_than(freq, paddle.to_tensor(h))
    after_h = paddle.greater_equal(freq, paddle.to_tensor(h))

    # The masks overlap, so the assignment order matters: each later
    # assignment overwrites the earlier ones where both masks are set.
    res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
    res[before_h] = scale * freq[before_h]
    res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
    res[outside_low_high_freq] = freq[outside_low_high_freq]

    return res
|
||||||
|
|
||||||
|
|
||||||
|
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    """Apply the VTLN warp to values expressed on the mel scale.

    The input is converted to Hz, warped with ``_vtln_warp_freq`` and
    converted back to mel.

    Args:
        vtln_low_cutoff (float): Lower cutoff of the warp.
        vtln_high_cutoff (float): Upper cutoff of the warp.
        low_freq: Lower frequency bound.
        high_freq (float): Upper frequency bound.
        vtln_warp_factor (float): Warp factor.
        mel_freq (Tensor): Mel-scale values to warp.

    Returns:
        Tensor: Warped values on the mel scale.
    """
    freq_hz = _inverse_mel_scale(mel_freq)
    warped_hz = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq,
                                high_freq, vtln_warp_factor, freq_hz)
    return _mel_scale(warped_hz)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    """Build the triangular mel filter bank weights.

    Args:
        num_bins (int): Number of mel bins.
        window_length_padded (int): Padded window length; must be even.
        sample_freq (float): Sample rate of the signal.
        low_freq (float): Lower edge of the filter bank.
        high_freq (float): Upper edge of the filter bank. Values ``<= 0`` are
            taken as an offset from the Nyquist frequency.
        vtln_low (float): Lower VTLN cutoff.
        vtln_high (float): Upper VTLN cutoff. Negative values are taken as an
            offset from the Nyquist frequency.
        vtln_warp_factor (float): VTLN warp factor; ``1.0`` disables warping.

    Returns:
        Tuple[Tensor, Tensor]: ``bins``, the filter weights of shape
        `(num_bins, num_fft_bins)`, and ``center_freqs``, the center of each
        bin converted back from mel.

    Raises:
        AssertionError: If the bin count or any frequency option is invalid.
    """
    # NOTE(review): the condition requires more than 3 bins while the message
    # says "at least 3" — confirm which one is intended.
    assert num_bins > 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # NOTE(review): true division makes this a float, which is then passed to
    # ``paddle.arange`` below — confirm that is intended.
    num_fft_bins = window_length_padded / 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))

    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)

    # Spacing of the triangle edges on the mel axis: num_bins overlapping
    # triangles span num_bins + 1 equal intervals.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))

    bin = paddle.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin * mel_freq_delta  # (num_bins, 1)
    center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta  # (num_bins, 1)
    right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta  # (num_bins, 1)

    if vtln_warp_factor != 1.0:
        # Warp the three edges of every triangle.
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
                                       vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)

    center_freqs = _inverse_mel_scale(center_mel)  # (num_bins)
    # (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)

    # (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # Triangle weight: the lower of the two slopes, clipped at zero.
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        # With warping, fill the rising and falling sides separately through
        # boolean masks; everything else stays zero.
        bins = paddle.zeros_like(up_slope)
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
            mel, center_mel)
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
|
||||||
|
|
||||||
|
|
||||||
|
def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str=POVEY) -> Tensor:
    """Compute mel filter-bank features from a waveform (Kaldi-compatible).

    The waveform is framed and windowed, transformed with a real FFT,
    projected onto a bank of mel filters and optionally log-compressed.
    The output is meant to be identical to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): Put energy in the last column when True. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from the waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to a power of two by zero-padding
            the input to the FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): When True, drop samples at the end of the waveform that cannot fill a
            single frame. Otherwise perform reflect padding at the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the mean of feature files. Defaults to False.
        use_energy (bool, optional): Add a dimension with the energy of the spectrogram to the output. Defaults to False.
        use_log_fbank (bool, optional): Return log fbank when True. Defaults to True.
        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Type of window used for the FFT computation. Defaults to POVEY.

    Returns:
        Tensor: A filter banks tensor with shape `(m, n_mels)`; one extra
            energy column is concatenated when `use_energy` is True.
    """
    out_dtype = waveform.dtype

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # Magnitude (or power) spectrum: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames).abs()
    if use_power:
        spectrum = spectrum.pow(2.)

    # Mel filters: (n_mels, padded_window_size // 2)
    filters, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                high_freq, vtln_low, vtln_high, vtln_warp)
    filters = filters.astype(out_dtype)

    # Append one zero column so the filters line up with the spectrum:
    # (n_mels, padded_window_size // 2 + 1)
    filters = paddle.nn.functional.pad(
        filters.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)

    # Project the spectrum onto the filters: (m, n_mels)
    feats = paddle.mm(spectrum, filters.T)
    if use_log_fbank:
        # Floor at epsilon before taking the log.
        feats = paddle.maximum(feats, _get_epsilon(out_dtype)).log()

    if use_energy:
        energy_column = log_energy.unsqueeze(1)
        # HTK layout puts the energy last, otherwise it comes first.
        if htk_compat:
            pieces = (feats, energy_column)
        else:
            pieces = (energy_column, feats)
        feats = paddle.concat(pieces, axis=1)  # (m, n_mels + 1)

    return _subtract_column_mean(feats, subtract_mean)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    """Return the DCT matrix that maps mel energies to cepstral coefficients.

    A square `(n_mels, n_mels)` 'ortho' DCT matrix is created, its first
    column is overwritten with the constant `sqrt(1 / n_mels)`, and only the
    first `n_mfcc` columns are kept.

    Args:
        n_mfcc (int): Number of cepstral coefficients (columns) to keep.
        n_mels (int): Number of mel bins (rows).

    Returns:
        Tensor: The truncated DCT matrix with shape `(n_mels, n_mfcc)`.
    """
    basis = create_dct(n_mels, n_mels, 'ortho')
    first_column_value = math.sqrt(1 / float(n_mels))
    basis[:, 0] = first_column_value
    return basis[:, :n_mfcc]  # (n_mels, n_mfcc)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    """Return the cepstral liftering weights for `n_mfcc` coefficients.

    The weight for coefficient index `i` is
    `1 + 0.5 * cepstral_lifter * sin(pi * i / cepstral_lifter)`.

    Args:
        n_mfcc (int): Number of cepstral coefficients.
        cepstral_lifter (float): Liftering constant; callers must pass a
            non-zero value because it is used as a divisor.

    Returns:
        Tensor: One weight per coefficient, shape `(n_mfcc,)`.
    """
    index = paddle.arange(n_mfcc)
    sine = paddle.sin(math.pi * index / cepstral_lifter)
    return 1.0 + 0.5 * cepstral_lifter * sine
|
||||||
|
|
||||||
|
|
||||||
|
def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str=POVEY) -> Tensor:
    """Compute mel frequency cepstral coefficients from a waveform (Kaldi-compatible).

    Log-power mel filter-bank features are computed with `fbank`, projected
    through a truncated DCT matrix, optionally liftered, and finally arranged
    according to `use_energy` / `htk_compat`. The output is meant to be
    identical to Kaldi's.

    Args:
        waveform (Tensor): A waveform tensor with shape `(C, T)`.
        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
        cepstral_lifter (float, optional): Scaling of output mfccs; 0 disables liftering. Defaults to 22.0.
        channel (int, optional): Select the channel of waveform. Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): Put energy in the last column when True. Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean from the waveform on frames. Defaults to True.
        round_to_power_of_two (bool, optional): If True, round window size to a power of two by zero-padding
            the input to the FFT. Defaults to True.
        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
        snip_edges (bool, optional): When True, drop samples at the end of the waveform that cannot fill a
            single frame. Otherwise perform reflect padding at the end of the waveform. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the mean of feature files. Defaults to False.
        use_energy (bool, optional): Add a dimension with the energy of the spectrogram to the output. Defaults to False.
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Type of window used for the FFT computation. Defaults to POVEY.

    Returns:
        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)

    out_dtype = waveform.dtype

    # Log-power filter banks; mean subtraction is deferred to the very end.
    # Shape: (m, n_mels + use_energy)
    feature = fbank(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        n_mels=n_mels,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sr=sr,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type)

    if use_energy:
        # fbank placed the energy last (HTK layout) or first; split it off.
        energy_index = n_mels if htk_compat else 0
        signal_log_energy = feature[:, energy_index]  # (m)
        first_mel = int(not htk_compat)
        feature = feature[:, first_mel:(n_mels + first_mel)]

    # (n_mels, n_mfcc)
    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=out_dtype)

    # (m, n_mfcc)
    feature = feature.matmul(dct_matrix)

    if cepstral_lifter != 0.0:
        # (1, n_mfcc), broadcast over frames
        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter_coeffs.astype(dtype=out_dtype)

    if use_energy:
        # The first coefficient is replaced by the signal log-energy.
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # HTK ordering: move the first column to the end.
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            # Without energy, the moved column is scaled by sqrt(2).
            energy *= math.sqrt(2)

        feature = paddle.concat((feature, energy), axis=1)

    return _subtract_column_mean(feature, subtract_mean)
|
@ -0,0 +1,788 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from librosa(https://github.com/librosa/librosa)
|
||||||
|
import warnings
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import scipy
|
||||||
|
from numpy.lib.stride_tricks import as_strided
|
||||||
|
from scipy import signal
|
||||||
|
|
||||||
|
from ..backends import depth_convert
|
||||||
|
from ..utils import ParameterError
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# dsp
|
||||||
|
'stft',
|
||||||
|
'mfcc',
|
||||||
|
'hz_to_mel',
|
||||||
|
'mel_to_hz',
|
||||||
|
'mel_frequencies',
|
||||||
|
'power_to_db',
|
||||||
|
'compute_fbank_matrix',
|
||||||
|
'melspectrogram',
|
||||||
|
'spectrogram',
|
||||||
|
'mu_encode',
|
||||||
|
'mu_decode',
|
||||||
|
# augmentation
|
||||||
|
'depth_augment',
|
||||||
|
'spect_augment',
|
||||||
|
'random_crop1d',
|
||||||
|
'random_crop2d',
|
||||||
|
'adaptive_spect_augment',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _pad_center(data: np.ndarray, size: int, axis: int=-1,
|
||||||
|
**kwargs) -> np.ndarray:
|
||||||
|
"""Pad an array to a target length along a target axis.
|
||||||
|
|
||||||
|
This differs from `np.pad` by centering the data prior to padding,
|
||||||
|
analogous to `str.center`
|
||||||
|
"""
|
||||||
|
|
||||||
|
kwargs.setdefault("mode", "constant")
|
||||||
|
n = data.shape[axis]
|
||||||
|
lpad = int((size - n) // 2)
|
||||||
|
lengths = [(0, 0)] * data.ndim
|
||||||
|
lengths[axis] = (lpad, int(size - n - lpad))
|
||||||
|
|
||||||
|
if lpad < 0:
|
||||||
|
raise ParameterError(("Target size ({size:d}) must be "
|
||||||
|
"at least input size ({n:d})"))
|
||||||
|
|
||||||
|
return np.pad(data, lengths, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_frames(x: np.ndarray,
|
||||||
|
frame_length: int,
|
||||||
|
hop_length: int,
|
||||||
|
axis: int=-1) -> np.ndarray:
|
||||||
|
"""Slice a data array into (overlapping) frames.
|
||||||
|
|
||||||
|
This function is aligned with librosa.frame
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not isinstance(x, np.ndarray):
|
||||||
|
raise ParameterError(
|
||||||
|
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
|
||||||
|
|
||||||
|
if x.shape[axis] < frame_length:
|
||||||
|
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
|
||||||
|
f" for frame_length={frame_length:d}")
|
||||||
|
|
||||||
|
if hop_length < 1:
|
||||||
|
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
|
||||||
|
|
||||||
|
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
|
||||||
|
warnings.warn(f"librosa.util.frame called with axis={axis} "
|
||||||
|
"on a non-contiguous input. This will result in a copy.")
|
||||||
|
x = np.asfortranarray(x)
|
||||||
|
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
|
||||||
|
warnings.warn(f"librosa.util.frame called with axis={axis} "
|
||||||
|
"on a non-contiguous input. This will result in a copy.")
|
||||||
|
x = np.ascontiguousarray(x)
|
||||||
|
|
||||||
|
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
|
||||||
|
strides = np.asarray(x.strides)
|
||||||
|
|
||||||
|
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
|
||||||
|
|
||||||
|
if axis == -1:
|
||||||
|
shape = list(x.shape)[:-1] + [frame_length, n_frames]
|
||||||
|
strides = list(strides) + [hop_length * new_stride]
|
||||||
|
|
||||||
|
elif axis == 0:
|
||||||
|
shape = [n_frames, frame_length] + list(x.shape)[1:]
|
||||||
|
strides = [hop_length * new_stride] + list(strides)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
|
||||||
|
|
||||||
|
return as_strided(x, shape=shape, strides=strides)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_audio(y, mono=True) -> bool:
|
||||||
|
"""Determine whether a variable contains valid audio data.
|
||||||
|
|
||||||
|
The audio y must be a np.ndarray, ether 1-channel or two channel
|
||||||
|
"""
|
||||||
|
if not isinstance(y, np.ndarray):
|
||||||
|
raise ParameterError("Audio data must be of type numpy.ndarray")
|
||||||
|
if y.ndim > 2:
|
||||||
|
raise ParameterError(
|
||||||
|
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if mono and y.ndim == 2:
|
||||||
|
raise ParameterError(
|
||||||
|
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
|
||||||
|
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if not np.issubdtype(y.dtype, np.floating):
|
||||||
|
raise ParameterError("Audio data must be floating-point")
|
||||||
|
|
||||||
|
if not np.isfinite(y).all():
|
||||||
|
raise ParameterError("Audio buffer is not finite everywhere")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert frequencies in Hz to the mel scale.

    With `htk=True` the HTK formula `2595 * log10(1 + f / 700)` is used.
    Otherwise the scale is linear below 1000 Hz and logarithmic above it.

    Args:
        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequency in mels.
    """
    hz = np.asanyarray(frequencies)

    if htk:
        return 2595.0 * np.log10(1.0 + hz / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    # Start with the linear mapping everywhere ...
    mels = (hz - f_min) / f_sp

    # ... then overwrite whatever falls into the log region.
    if not hz.ndim:
        # Scalar input: decide directly.
        if hz >= min_log_hz:
            mels = min_log_mel + np.log(hz / min_log_hz) / logstep
        return mels

    # Array input: patch only the entries in the log region.
    in_log_region = hz >= min_log_hz
    mels[in_log_region] = min_log_mel + \
        np.log(hz[in_log_region] / min_log_hz) / logstep
    return mels
|
||||||
|
|
||||||
|
|
||||||
|
def mel_to_hz(mels: Union[float, List[float], np.ndarray],
              htk: bool=False) -> np.ndarray:
    """Convert mel bin numbers to frequencies in Hz.

    With `htk=True` the HTK formula `700 * (10 ** (m / 2595) - 1)` is used.
    Otherwise the scale is linear below the mel value that corresponds to
    1000 Hz and exponential above it.

    Args:
        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Frequencies in Hz.
    """
    mel_array = np.asanyarray(mels)

    if htk:
        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)

    # Fill in the linear scale
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel_array

    # And now the nonlinear scale
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = np.log(6.4) / 27.0  # step size for log region

    if mel_array.ndim:
        # Array input: overwrite only the entries in the log region.
        log_t = mel_array >= min_log_mel
        freqs[log_t] = min_log_hz * \
            np.exp(logstep * (mel_array[log_t] - min_log_mel))
    elif mel_array >= min_log_mel:
        # Scalar input: check directly.
        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))

    return freqs
|
||||||
|
|
||||||
|
|
||||||
|
def mel_frequencies(n_mels: int=128,
                    fmin: float=0.0,
                    fmax: float=11025.0,
                    htk: bool=False) -> np.ndarray:
    """Compute frequencies that are evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Convert both limits to mels, space the points uniformly there, then
    # map the grid back to Hz.
    mel_low, mel_high = (hz_to_mel(edge, htk=htk) for edge in (fmin, fmax))
    mel_grid = np.linspace(mel_low, mel_high, n_mels)
    return mel_to_hz(mel_grid, htk=htk)
|
||||||
|
|
||||||
|
|
||||||
|
def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
    """Compute the center frequencies of the real-FFT bins.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.

    Returns:
        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`,
            evenly spaced from 0 up to and including `sr / 2`.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, nyquist, n_bins, endpoint=True)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=128,
                         fmin: float=0.0,
                         fmax: Optional[float]=None,
                         htk: bool=False,
                         norm: str="slaney",
                         dtype: type=np.float32) -> np.ndarray:
    """Compute the matrix of triangular mel filters that maps FFT bins to mel bins.

    Args:
        sr (int): Sample rate.
        n_fft (int): FFT size.
        n_mels (int, optional): Number of mel bins. Defaults to 128.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        fmax (Optional[float], optional): Maximum frequency in Hz; `sr / 2` when None. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (str, optional): Type of normalization; only "slaney" is accepted. Defaults to "slaney".
        dtype (type, optional): Data type of the returned matrix. Defaults to np.float32.

    Returns:
        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.

    Raises:
        ParameterError: If `norm` is anything other than "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    if fmax is None:
        fmax = float(sr) / 2

    n_mels = int(n_mels)

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # Band edges: n_mels + 2 points uniformly spaced on the mel scale, so
    # that filter i spans mel_f[i] .. mel_f[i + 2] and peaks at mel_f[i + 1].
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    # Rising and falling slopes of every filter at once, then intersect them
    # with each other and with zero to obtain the triangles.
    rising = -ramps[:n_mels] / fdiff[:n_mels, np.newaxis]
    falling = ramps[2:n_mels + 2] / fdiff[1:n_mels + 1, np.newaxis]
    weights = np.maximum(0, np.minimum(rising, falling)).astype(dtype)

    # Slaney-style mel is scaled to be approx constant energy per channel
    enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
    weights *= enorm[:, np.newaxis]

    # A filter whose lower edge is above 0 Hz yet has no positive weight
    # means we have an empty channel somewhere.
    has_response = (mel_f[:-2] == 0) | (weights.max(axis=1) > 0)
    if not np.all(has_response):
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return weights
|
||||||
|
|
||||||
|
|
||||||
|
def stft(x: np.ndarray,
         n_fft: int=2048,
         hop_length: Optional[int]=None,
         win_length: Optional[int]=None,
         window: str="hann",
         center: bool=True,
         dtype: type=np.complex64,
         pad_mode: str="reflect") -> np.ndarray:
    """Short-time Fourier transform (STFT).

    Args:
        x (np.ndarray): Input waveform in one dimension.
        n_fft (int, optional): FFT size. Defaults to 2048.
        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows;
            `win_length // 4` when None. Defaults to None.
        win_length (Optional[int], optional): The size of window; `n_fft` when None. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that sample `t * hop_length` lies at the center
            of the `t`-th frame. Defaults to True.
        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".

    Returns:
        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.

    Raises:
        ParameterError: If `x` is not valid audio, or if `center` is False and
            `n_fft` exceeds the length of `x`.
    """
    _check_audio(x)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # Build the window, pad it out to n_fft size, and make it a column so it
    # broadcasts over the frame axis.
    fft_window = signal.get_window(window, win_length, fftbins=True)
    fft_window = _pad_center(fft_window, n_fft).reshape((-1, 1))

    # The condition below is `n_fft > len(x)`, i.e. the FFT is too LARGE for
    # the signal; the previous message wrongly said "too small".
    if center:
        if n_fft > x.shape[-1]:
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
            )
        # Pad the time series so that frames are centered
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
    elif n_fft > x.shape[-1]:
        raise ParameterError(
            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
    n_frames = x_frames.shape[1]

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), n_frames), dtype=dtype, order="F")

    # Constrain STFT block sizes to 256 KB: how many columns fit in a block?
    MAX_MEM_BLOCK = 2**8 * 2**10
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Transform block by block to bound the size of the temporaries.
    for start in range(0, n_frames, n_columns):
        stop = min(start + n_columns, n_frames)
        stft_matrix[:, start:stop] = np.fft.rfft(
            fft_window * x_frames[:, start:stop], axis=0)

    return stft_matrix
|
||||||
|
|
||||||
|
|
||||||
|
def power_to_db(spect: np.ndarray,
                ref: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=80.0) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling `10 * log10(x / ref)` in a numerically
    stable way by flooring both `x` and `ref` at `amin`.

    Args:
        spect (np.ndarray): STFT power spectrogram of an input waveform.
        ref (float, optional): The reference value; a callable is also accepted and is applied to the
            magnitude to obtain the reference. If smaller than 1.0, the db level of the signal will be
            pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak;
            None disables the threshold. Defaults to 80.0.

    Returns:
        np.ndarray: Power spectrogram in db scale.

    Raises:
        ParameterError: If `amin` is not strictly positive or `top_db` is negative.
    """
    spect = np.asarray(spect)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    magnitude = spect
    if np.issubdtype(spect.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.")
        magnitude = np.abs(spect)

    # A callable reference is evaluated on the magnitude itself.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    # In-place on purpose: keeps the dtype of the spectrogram term.
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise ParameterError("top_db must be non-negative")

    # Clip everything that lies more than top_db below the peak.
    return np.maximum(log_spec, log_spec.max() - top_db)
|
||||||
|
|
||||||
|
|
||||||
|
def mfcc(x: np.ndarray,
         sr: int=16000,
         spect: Optional[np.ndarray]=None,
         n_mfcc: int=20,
         dct_type: int=2,
         norm: str="ortho",
         lifter: int=0,
         **kwargs) -> np.ndarray:
    """Mel-frequency cepstral coefficients (MFCCs).

    Args:
        x (np.ndarray): Input waveform in one dimension. Ignored when `spect` is given.
        sr (int, optional): Sample rate. Defaults to 16000.
        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram; computed from `x`
            with `melspectrogram` when None. Defaults to None.
        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
        norm (str, optional): Type of normalization. Defaults to "ortho".
        lifter (int, optional): Cepstral filtering. When positive, coefficient `n` (0-based) is
            multiplied by `1 + (lifter / 2) * sin(pi * (n + 1) / lifter)`. Defaults to 0.
        **kwargs: Extra keyword arguments forwarded to `melspectrogram`.

    Returns:
        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.

    Raises:
        ParameterError: If `lifter` is negative.
    """
    # Imported here so the submodule is loaded explicitly instead of relying
    # on a bare `import scipy` exposing `scipy.fftpack`.
    from scipy.fftpack import dct

    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    M = dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]

    if lifter > 0:
        # Size the weights from M itself: when `spect` has fewer than n_mfcc
        # rows the slice above returns fewer coefficients than requested.
        n_coeffs = M.shape[0]
        # Sinusoidal lifter 1 + (L / 2) * sin(pi * n / L); the previous code
        # multiplied by the bare sine, which zeroes coefficients instead of
        # re-weighting them.
        weights = 1 + (lifter / 2) * np.sin(
            np.pi * np.arange(1, 1 + n_coeffs, dtype=M.dtype) / lifter)
        return M * weights[:, np.newaxis]
    elif lifter == 0:
        return M
    else:
        raise ParameterError(
            f"MFCC lifter={lifter} must be a non-negative number")
|
||||||
|
|
||||||
|
|
||||||
|
def melspectrogram(x: np.ndarray,
                   sr: int=16000,
                   window_size: int=512,
                   hop_length: int=320,
                   n_mels: int=64,
                   fmin: float=50.0,
                   fmax: Optional[float]=None,
                   window: str='hann',
                   center: bool=True,
                   pad_mode: str='reflect',
                   power: float=2.0,
                   to_db: bool=True,
                   ref: float=1.0,
                   amin: float=1e-10,
                   top_db: Optional[float]=None) -> np.ndarray:
    """Compute mel-spectrogram.

    Args:
        x (np.ndarray): Input waveform in one dimension.
        sr (int, optional): Sample rate. Defaults to 16000.
        window_size (int, optional): Size of FFT and window length. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        fmax (Optional[float], optional): Maximum frequency in Hz. When None, `sr // 2` is used. Defaults to None.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
        to_db (bool, optional): Enable db scale. Defaults to True.
        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.

    Returns:
        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.

    Raises:
        ParameterError: If the waveform is empty, or unless `0 <= fmin < fmax`.
    """
    _check_audio(x, mono=True)
    if len(x) <= 0:
        raise ParameterError('The input waveform is empty')

    if fmax is None:
        fmax = sr // 2
    if fmin < 0 or fmin >= fmax:
        # The message mirrors the actual check: fmin == 0 is accepted.
        raise ParameterError('fmin and fmax must satisfy 0<=fmin<fmax')

    s = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)

    # |STFT|**power, then project the frequency bins onto the mel filter bank.
    spect_power = np.abs(s)**power
    fb_matrix = compute_fbank_matrix(
        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
    mel_spect = np.matmul(fb_matrix, spect_power)

    if to_db:
        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
    return mel_spect
|
||||||
|
|
||||||
|
|
||||||
|
def spectrogram(x: np.ndarray,
                sr: int=16000,
                window_size: int=512,
                hop_length: int=320,
                window: str='hann',
                center: bool=True,
                pad_mode: str='reflect',
                power: float=2.0) -> np.ndarray:
    """Compute spectrogram.

    Args:
        x (np.ndarray): Input waveform in one dimension.
        sr (int, optional): Sample rate. Not used by the computation itself. Defaults to 16000.
        window_size (int, optional): Size of FFT and window length. Defaults to 512.
        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
        window (str, optional): A string of window specification. Defaults to "hann".
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.

    Returns:
        np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
    """
    # FFT size and window length are both tied to `window_size`.
    stft_options = dict(
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)
    magnitude = np.abs(stft(x, **stft_options))
    return magnitude**power
|
||||||
|
|
||||||
|
|
||||||
|
def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law encoding. Encode waveform based on mu-law companding.

    When `quantized` is True, the companded signal is rounded to integer-valued
    levels in range `[0, mu]`. Otherwise, the resulting waveform is in range `[-1, 1]`.

    Args:
        x (np.ndarray): The input waveform to encode, expected in range `[-1, 1]`.
        mu (int, optional): The encoding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.

    Returns:
        np.ndarray: The mu-law encoded waveform.
    """
    # Honor the caller's `mu` (it used to be overwritten with 255 here).
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    if quantized:
        # Map [-1, 1] onto [0, mu] and round to the nearest level.
        y = np.floor((y + 1) / 2 * mu + 0.5)
    return y
|
||||||
|
|
||||||
|
|
||||||
|
def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
    """Mu-law decoding. Compute the mu-law expansion of an encoded signal.

    The expansion is carried out with `mu - 1` as the companding constant. The
    input `y` is assumed to be in range `[0, mu-1]` when `quantized` is True
    and in `[-1, 1]` otherwise.

    Args:
        y (np.ndarray): The encoded waveform.
        mu (int, optional): The encoding parameter. Defaults to 255.
        quantized (bool, optional): If `True`, the input is assumed to be quantized
            integer levels and is first rescaled to `[-1, 1]`. Defaults to True.

    Returns:
        np.ndarray: The mu-law decoded waveform.

    Raises:
        ParameterError: If `mu` is smaller than 1.
    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    scale = mu - 1
    # Undo the quantization first, when requested.
    companded = y * 2 / scale - 1 if quantized else y
    return np.sign(companded) / scale * ((1 + scale)**np.abs(companded) - 1)
|
||||||
|
|
||||||
|
|
||||||
|
def _randint(high: int) -> int:
    """Draw one random integer from the half-open range [0, high).

    Helper for the random data augmentation routines; uses NumPy's global
    random state.
    """
    drawn = np.random.randint(low=0, high=high)
    return int(drawn)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_augment(y: np.ndarray,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
    """Audio depth augmentation.

    Simulates quantization distortion by converting the waveform to a randomly
    chosen depth and then converting it back to its original dtype.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        choices (List, optional): Candidate data types for the depth conversion. Defaults to ['int8', 'int16'].
        probs (List[float], optional): Probability of picking each entry of `choices`. Defaults to [0.5, 0.5].

    Returns:
        np.ndarray: The augmented waveform.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))

    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype

    # Round trip: down to the sampled depth, then back to the source dtype.
    degraded = depth_convert(y, target_depth)
    return depth_convert(degraded, original_depth)
|
||||||
|
|
||||||
|
|
||||||
|
def adaptive_spect_augment(spect: np.ndarray,
                           tempo_axis: int=0,
                           level: float=0.1) -> np.ndarray:
    """Do adaptive spectrogram augmentation.

    Both the number of masks and their widths scale with `level`, which ranges
    from 0 to 1, with 0 meaning no augmentation. Masked regions are set to zero
    in place and the same array is returned.

    Args:
        spect (np.ndarray): Input spectrogram (2D).
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
        level (float, optional): The level factor of masking. Defaults to 0.1.

    Returns:
        np.ndarray: The augmented spectrogram.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    rows, cols = spect.shape
    if tempo_axis == 0:
        time_axis, freq_axis = 0, 1
        nt, nf = rows, cols
    else:
        time_axis, freq_axis = 1, 0
        nt, nf = cols, rows

    # (axis to mask, axis length, mask width, number of masks); time first, then frequency.
    mask_plan = (
        (time_axis, nt, int(nt * level * 0.5), int(10 * level)),
        (freq_axis, nf, int(nf * level * 0.5), int(10 * level)), )

    for axis, size, width, count in mask_plan:
        for _ in range(count):
            start = _randint(size - width)
            region = [slice(None), slice(None)]
            region[axis] = slice(start, start + width)
            spect[tuple(region)] = 0

    return spect
|
||||||
|
|
||||||
|
|
||||||
|
def spect_augment(spect: np.ndarray,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> np.ndarray:
    """Do spectrogram augmentation in both time and freq axis.

    The number of masks and the mask width are drawn at random (each strictly
    below its `max_*` bound). Masked regions are set to zero in place and the
    same array is returned.

    Args:
        spect (np.ndarray): Input spectrogram (2D).
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
        max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
        max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3.
        max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
        max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20.

    Returns:
        np.ndarray: The augmented spectrogram.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    rows, cols = spect.shape
    if tempo_axis == 0:
        time_axis, freq_axis = 0, 1
        nt, nf = rows, cols
    else:
        time_axis, freq_axis = 1, 0
        nt, nf = cols, rows

    # Draw order matters for reproducibility: counts first, then widths.
    num_time_mask = _randint(max_time_mask)
    num_freq_mask = _randint(max_freq_mask)
    time_mask_width = _randint(max_time_mask_width)
    freq_mask_width = _randint(max_freq_mask_width)

    # (axis to mask, axis length, mask width, number of masks); time first, then frequency.
    mask_plan = (
        (time_axis, nt, time_mask_width, num_time_mask),
        (freq_axis, nf, freq_mask_width, num_freq_mask), )

    for axis, size, width, count in mask_plan:
        for _ in range(count):
            # NOTE: the axis must be longer than the sampled mask width.
            start = _randint(size - width)
            region = [slice(None), slice(None)]
            region[axis] = slice(start, start + width)
            spect[tuple(region)] = 0

    return spect
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
    """Random cropping on an input waveform.

    Args:
        y (np.ndarray): Input waveform array in 1D.
        crop_len (int): Length of waveform to crop. Must be smaller than `len(y)`,
            because the start index is drawn from `[0, len(y) - crop_len)`.

    Returns:
        np.ndarray: The cropped waveform.

    Raises:
        ParameterError: If `y` is not one-dimensional.
    """
    if y.ndim != 1:
        # Previously a bare string expression with no effect; enforce the contract.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    idx = _randint(n - crop_len)
    return y[idx:idx + crop_len]
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop2d(s: np.ndarray, crop_len: int,
                  tempo_axis: int=0) -> np.ndarray:
    """Random cropping on a spectrogram.

    Args:
        s (np.ndarray): Input spectrogram in 2D.
        crop_len (int): Length of spectrogram to crop along the tempo axis.
        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.

    Returns:
        np.ndarray: The cropped spectrogram.

    Raises:
        ParameterError: If `tempo_axis` is not smaller than `s.ndim`.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    axis_len = s.shape[tempo_axis]
    start = _randint(high=axis_len - crop_len)

    # Take everything on every axis except the tempo axis, which is windowed.
    window = [slice(None)] * s.ndim
    window[tempo_axis] = slice(start, start + crop_len)
    return s[tuple(window)]
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .esc50 import ESC50
|
||||||
|
from .gtzan import GTZAN
|
||||||
|
from .hey_snips import HeySnips
|
||||||
|
from .rirs_noises import OpenRIRNoise
|
||||||
|
from .tess import TESS
|
||||||
|
from .urban_sound import UrbanSound8K
|
||||||
|
from .voxceleb import VoxCeleb
|
@ -0,0 +1,100 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from ..backends.soundfile_backend import soundfile_load as load_audio
|
||||||
|
from ..compliance.kaldi import fbank as kaldi_fbank
|
||||||
|
from ..compliance.kaldi import mfcc as kaldi_mfcc
|
||||||
|
from ..compliance.librosa import melspectrogram
|
||||||
|
from ..compliance.librosa import mfcc
|
||||||
|
|
||||||
|
# Lookup table from a `feat_type` name to its feature extractor.
# 'raw' maps to None: the waveform is returned without feature extraction.
feat_funcs = dict(
    raw=None,
    melspectrogram=melspectrogram,
    mfcc=mfcc,
    kaldi_fbank=kaldi_fbank,
    kaldi_mfcc=kaldi_mfcc, )
|
||||||
|
|
||||||
|
|
||||||
|
class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Holds parallel lists of audio files and labels, and extracts the configured
    feature type lazily when an item is requested.
    """

    def __init__(self,
                 files: List[str],
                 labels: List[int],
                 feat_type: str='raw',
                 sample_rate: int=None,
                 **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
            sample_rate (:obj:`int`, `optional`, defaults to `None`):
                Sample rate requested from the audio loader. When `None`, the loader
                is called without a target rate.
            **kwargs: Extra keyword arguments forwarded to the feature function.

        Raises:
            RuntimeError: If `feat_type` is not a key of `feat_funcs`.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in feat_funcs.keys():
            raise RuntimeError(
                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
            )

        self.files = files
        self.labels = labels

        self.feat_type = feat_type
        self.sample_rate = sample_rate
        self.feat_config = kwargs  # Pass keyword arguments to customize feature config

    def _get_data(self, input_file: str):
        """Collect files and labels; must be implemented by subclasses."""
        raise NotImplementedError

    def _convert_to_record(self, idx):
        """Load the `idx`-th audio file and build a dict with keys `feat` and `label`."""
        file, label = self.files[idx], self.labels[idx]

        if self.sample_rate is None:
            waveform, sample_rate = load_audio(file)
        else:
            waveform, sample_rate = load_audio(file, sr=self.sample_rate)

        feat_func = feat_funcs[self.feat_type]

        record = {}
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
            # Use the rate reported by the loader: `self.sample_rate` may be None
            # when no target rate was requested.
            record['feat'] = feat_func(
                waveform=waveform, sr=sample_rate, **self.feat_config)
        else:
            record['feat'] = feat_func(
                waveform, sample_rate,
                **self.feat_config) if feat_func else waveform
        record['label'] = label
        return record

    def __getitem__(self, idx):
        """Return the sample at `idx`.

        For kaldi features the result is `(key, feat, label)`; this reads
        `self.keys`, which this base class does not set, so a subclass has to
        provide it. For all other feature types the result is
        `(transposed feat array, int64 label array)`.
        """
        record = self._convert_to_record(idx)
        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
            return self.keys[idx], record['feat'], record['label']
        else:
            return np.array(record['feat']).transpose(), np.array(
                record['label'], dtype=np.int64)

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.files)
|
@ -0,0 +1,152 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['ESC50']
|
||||||
|
|
||||||
|
|
||||||
|
class ESC50(AudioClassificationDataset):
    """
    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
    suitable for benchmarking methods of environmental sound classification. The dataset
    consists of 5-second-long recordings organized into 50 semantical classes (with
    40 examples per class)

    Reference:
        ESC: Dataset for Environmental Sound Classification
        http://dx.doi.org/10.1145/2733373.2806390
    """

    # Archive fetched when the local copy is missing.
    archieves = [
        {
            'url':
            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
            'md5': '7771e4b9d86d0945acce719c7a59305a',
        },
    ]
    label_list = [
        # Animals
        'Dog', 'Rooster', 'Pig', 'Cow', 'Frog',
        'Cat', 'Hen', 'Insects (flying)', 'Sheep', 'Crow',
        # Natural soundscapes & water sounds
        'Rain', 'Sea waves', 'Crackling fire', 'Crickets', 'Chirping birds',
        'Water drops', 'Wind', 'Pouring water', 'Toilet flush', 'Thunderstorm',
        # Human, non-speech sounds
        'Crying baby', 'Sneezing', 'Clapping', 'Breathing', 'Coughing',
        'Footsteps', 'Laughing', 'Brushing teeth', 'Snoring', 'Drinking, sipping',
        # Interior/domestic sounds
        'Door knock', 'Mouse click', 'Keyboard typing', 'Door, wood creaks', 'Can opening',
        'Washing machine', 'Vacuum cleaner', 'Clock alarm', 'Clock tick', 'Glass breaking',
        # Exterior/urban noises
        'Helicopter', 'Chainsaw', 'Siren', 'Car horn', 'Engine',
        'Train', 'Church bells', 'Airplane', 'Fireworks', 'Hand saw',
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the metadata CSV (header row skipped) into `META_INFO` tuples."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            rows = rf.readlines()[1:]
        return [self.meta_info(*row.strip().split(',')) for row in rows]

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Return the audio paths and integer labels selected by `mode` and `split`.

        The archive is downloaded first when the audio directory or the
        metadata file is missing under DATA_HOME.
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
            download_and_decompress(self.archieves, DATA_HOME)

        files = []
        labels = []
        for sample in self._get_meta_info():
            in_dev_fold = int(sample.fold) == split
            # train keeps every fold except `split`; any other mode keeps only `split`.
            if (mode == 'train') != in_dev_fold:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, sample.filename))
                labels.append(int(sample.target))

        return files, labels
|
@ -0,0 +1,115 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['GTZAN']
|
||||||
|
|
||||||
|
|
||||||
|
class GTZAN(AudioClassificationDataset):
    """
    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
    in machine listening research for music genre recognition (MGR).

    Reference:
        Musical genre classification of audio signals
        https://ieeexplore.ieee.org/document/1021072/
    """

    # Archive fetched when the local copy is missing.
    archieves = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = [
        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock'
    ]
    # Paths below are relative to DATA_HOME.
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the tab-separated metadata file into `META_INFO` tuples."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            rows = rf.readlines()
        return [self.meta_info(*row.strip().split('\t')) for row in rows]

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Return the audio paths and label indices selected by `mode` and `split`.

        Samples are shuffled with the global `random` module seeded by `seed`,
        then assigned to consecutive folds numbered from 1.
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()
        # The same seed must be used to create the train and dev datasets,
        # otherwise the folds would not line up.
        random.seed(seed)
        random.shuffle(meta_info)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for position, (file_path, label) in enumerate(meta_info):
            target = self.label_list.index(label)
            fold = position // n_samples_per_fold + 1

            # train keeps every fold except `split`; any other mode keeps only `split`.
            if (mode == 'train') != (fold == split):
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label,
                                 os.path.basename(file_path)))
                labels.append(target)

        return files, labels
|
@ -0,0 +1,74 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['HeySnips']
|
||||||
|
|
||||||
|
|
||||||
|
class HeySnips(AudioClassificationDataset):
    """Audio classification dataset read from `<data_dir>/<mode>.json` manifests.

    Each manifest entry with a positive duration becomes one sample; hotword
    entries get label 0 and all others get label -1.
    """

    meta_info = collections.namedtuple('META_INFO',
                                       ('key', 'label', 'duration', 'wav'))

    def __init__(self,
                 data_dir: os.PathLike,
                 mode: str='train',
                 feat_type: str='kaldi_fbank',
                 sample_rate: int=16000,
                 **kwargs):
        """
        Args:
            data_dir (os.PathLike): Directory holding the `<mode>.json` manifest; audio
                paths in the manifest are joined onto this directory.
            mode (str, optional): Name of the manifest to load. Defaults to 'train'.
            feat_type (str, optional): Feature type to extract. Defaults to 'kaldi_fbank'.
            sample_rate (int, optional): Sample rate passed to the base class. Defaults to 16000.
        """
        self.data_dir = data_dir
        files, labels = self._get_data(mode)
        super(HeySnips, self).__init__(
            files=files,
            labels=labels,
            feat_type=feat_type,
            sample_rate=sample_rate,
            **kwargs)

    def _get_meta_info(self, mode) -> List[collections.namedtuple]:
        """Load the manifest for `mode` and keep the entries with positive duration."""
        manifest = os.path.join(self.data_dir, '{}.json'.format(mode))
        with open(manifest, 'r') as f:
            entries = json.load(f)

        records = []
        for entry in entries:
            if entry['duration'] > 0:
                records.append(
                    self.meta_info(
                        key=entry['id'],
                        label=0 if entry['is_hotword'] == 1 else -1,
                        duration=entry['duration'],
                        wav=os.path.join(self.data_dir,
                                         entry['audio_file_path'])))
        return records

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """Return wav paths and labels; also fills `self.keys` and `self.durations`."""
        meta_info = self._get_meta_info(mode)

        files, labels = [], []
        self.keys = []
        self.durations = []
        for key, target, duration, wav in meta_info:
            files.append(wav)
            labels.append(int(target))
            self.keys.append(key)
            self.durations.append(float(duration))

        return files, labels
|
@ -0,0 +1,201 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..backends.soundfile_backend import soundfile_load as load_audio
|
||||||
|
from ..backends.soundfile_backend import soundfile_save as save_wav
|
||||||
|
from ..utils import DATA_HOME
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from .dataset import feat_funcs
|
||||||
|
|
||||||
|
__all__ = ['OpenRIRNoise']
|
||||||
|
|
||||||
|
|
||||||
|
class OpenRIRNoise(Dataset):
    """Room impulse response (RIR) and noise clips from the ``rirs_noises`` archive.

    The archive is downloaded into ``base_path`` when that directory does not
    exist. CSV manifests (``rir.csv`` and ``noise.csv``, columns ``id``,
    ``duration``, ``wav``) are generated under ``csv_path`` and the manifest
    of the requested subset is loaded, shuffled and served item by item.
    """

    # The attribute name (including its spelling) is kept as-is: it is
    # reachable from outside the class.
    archieves = [
        {
            'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
            'md5': 'e6f48e257286e05de56413b4779d8ffb',
        },
    ]

    sample_rate = 16000
    meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
    base_path = os.path.join(DATA_HOME, 'open_rir_noise')
    wav_path = os.path.join(base_path, 'RIRS_NOISES')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['rir', 'noise']

    def __init__(self,
                 subset: str = 'rir',
                 feat_type: str = 'raw',
                 target_dir=None,
                 random_chunk: bool = True,
                 chunk_duration: float = 3.0,
                 seed: int = 0,
                 **kwargs):
        """Prepare the data on disk if needed and load one subset.

        Args:
            subset: Manifest to load; must be one of ``subsets``.
            feat_type: Key into ``feat_funcs`` selecting the feature function
                applied when an item is fetched.
            target_dir: When given, manifests live in
                ``<target_dir>/open_rir_noise/csv`` instead of the default
                ``csv_path``.
            random_chunk: Stored on the instance; not read by this class.
            chunk_duration: Length, in seconds, of the pieces longer clips are
                cut into while the manifests are generated.
            seed: Accepted but currently unused (seeding is disabled below).
            **kwargs: Extra keyword arguments forwarded to the feature function.

        Raises:
            AssertionError: If ``subset`` is not one of ``subsets``.
        """
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration

        # NOTE: this is assigned on the class, so it also affects every
        # instance created afterwards.
        OpenRIRNoise.csv_path = os.path.join(
            target_dir, "open_rir_noise",
            "csv") if target_dir else self.csv_path
        self._data = self._get_data()
        super(OpenRIRNoise, self).__init__()

        # Set up a seed to reproduce training or predicting result.
        # random.seed(seed)

    def _get_data(self):
        """Download/prepare the corpus if needed and load the subset manifest.

        Returns:
            A shuffled list of ``meta_info`` records for ``self.subset``.
        """
        # Download audio files.
        print(f"rirs noises base path: {self.base_path}")
        if not os.path.isdir(self.base_path):
            download_and_decompress(
                self.archieves, self.base_path, decompress=True)
        else:
            print(
                f"{self.base_path} already exists, we will not download and decompress again"
            )

        # Data preparation. The manifest file itself is checked (not just its
        # directory) so that an interrupted earlier run is repaired instead of
        # failing on a missing file.
        print(f"prepare the csv to {self.csv_path}")
        subset_csv = os.path.join(self.csv_path, f'{self.subset}.csv')
        if not os.path.isfile(subset_csv):
            os.makedirs(self.csv_path, exist_ok=True)
            self.prepare_data()

        data = []
        # Read with the csv module so quoted fields written by generate_csv
        # (e.g. paths containing commas) round-trip correctly.
        with open(subset_csv, 'r', newline='') as rf:
            reader = csv.reader(rf)
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # tolerate blank lines
                    continue
                audio_id, duration, wav = row
                data.append(self.meta_info(audio_id, float(duration), wav))

        random.shuffle(data)
        return data

    def _convert_to_record(self, idx: int):
        """Load the audio of item ``idx`` and attach its features.

        Returns:
            A dict holding every ``meta_info`` field plus ``'feat'``, which is
            the output of the selected feature function, or the loaded
            waveform when that function is falsy.

        Raises:
            AssertionError: If ``self.feat_type`` is not a key of ``feat_funcs``.
        """
        sample = self._data[idx]

        # Copy every namedtuple field into a plain dict.
        record = {field: getattr(sample, field) for field in type(sample)._fields}

        waveform, sr = load_audio(record['wav'])

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """Name the consecutive ``seg_dur``-long chunks that fit in a clip.

        Args:
            seg_dur: Chunk length, in the same unit as ``audio_duration``
                (callers in this class pass seconds).
            audio_id: Prefix of every chunk name.
            audio_duration: Total clip length.

        Returns:
            A list of ``"<audio_id>_<start>_<end>"`` strings, one per whole
            chunk; a trailing partial chunk is not included.
        """
        num_chunks = int(audio_duration / seg_dur)

        return [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """Build the manifest rows for one wav file.

        When ``split_chunks`` is set and the clip is longer than
        ``self.chunk_duration``, each whole chunk is written to its own wav
        file under ``self.base_path`` and gets its own row; otherwise a single
        row describing the whole file is returned.

        Returns:
            A list of ``[id, duration, wav]`` rows.
        """
        waveform, sr = load_audio(wav_file)
        # The id is the path below ".../open_rir_noise/" without its extension.
        audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
        audio_duration = waveform.shape[0] / sr

        if not (split_chunks and audio_duration > self.chunk_duration):
            # Keep whole audio.
            return [[audio_id, audio_duration, wav_file]]

        # Split into pieces of self.chunk_duration seconds.
        rows = []
        uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
                                            audio_duration)
        for idx, chunk in enumerate(uniq_chunks_list):
            s, e = chunk.split("_")[-2:]  # Timestamps of start and end
            start_sample = int(float(s) * sr)
            end_sample = int(float(e) * sr)
            new_wav_file = os.path.join(self.base_path,
                                        audio_id + f'_chunk_{idx+1:02}.wav')
            save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
            # id, duration, new_wav
            rows.append([chunk, self.chunk_duration, new_wav_file])
        return rows

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool = True):
        """Write the manifest describing ``wav_files`` to ``output_file``.

        Args:
            wav_files: Paths of the audio files to describe.
            output_file: Destination CSV path (overwritten).
            split_chunks: Forwarded to ``_get_audio_info``.
        """
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav"]

        infos = list(
            tqdm(
                (self._get_audio_info(wav_file, split_chunks)
                 for wav_file in wav_files),
                total=len(wav_files)))

        csv_lines = []
        for info in infos:
            csv_lines.extend(info)

        # newline="" is what the csv module requires; without it extra "\r"
        # characters are written on platforms that translate line endings.
        with open(output_file, mode="w", newline="") as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            csv_writer.writerows(csv_lines)

    def _read_file_list(self, list_file: str) -> List[str]:
        """Return the wav paths named by the last field of each line of ``list_file``.

        Paths are joined onto ``self.base_path``.
        """
        with open(list_file, 'r') as f:
            return [
                os.path.join(self.base_path, line.strip().split(' ')[-1])
                for line in f
            ]

    def prepare_data(self):
        """Generate ``rir.csv`` and ``noise.csv`` from the corpus list files."""
        rir_files = self._read_file_list(
            os.path.join(self.wav_path, "real_rirs_isotropic_noises",
                         "rir_list"))
        noise_files = self._read_file_list(
            os.path.join(self.wav_path, "pointsource_noises", "noise_list"))

        self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
        self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))

    def __getitem__(self, idx):
        """Return the record (metadata plus features) of item ``idx``."""
        return self._convert_to_record(idx)

    def __len__(self):
        """Return the number of loaded manifest rows."""
        return len(self._data)
|
@ -0,0 +1,126 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['TESS']
|
||||||
|
|
||||||
|
|
||||||
|
class TESS(AudioClassificationDataset):
    """
    TESS is a set of 200 target words that were spoken in the carrier phrase
    "Say the word _____" by two actresses (aged 26 and 64 years) and
    recordings were made of the set portraying each of seven emotions (anger,
    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
    There are 2800 stimuli in total.

    Reference:
        Toronto emotional speech set (TESS)
        https://doi.org/10.5683/SP2/E8H2MF
    """

    archieves = [
        {
            'url':
            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
            'md5':
            '1465311b24d1de704c4c63e4ccc470c7',
        },
    ]
    label_list = [
        'angry',
        'disgust',
        'fear',
        'happy',
        'neutral',
        'ps',  # pleasant surprise
        'sad',
    ]
    meta_info = collections.namedtuple('META_INFO',
                                       ('speaker', 'word', 'emotion'))
    audio_path = 'TESS_Toronto_emotional_speech_set'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(TESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Parse ``<speaker>_<word>_<emotion>`` out of every file name.

        Args:
            files: Paths of the audio files.

        Returns:
            One ``meta_info`` record per file, in input order.
        """
        return [
            self.meta_info(
                *os.path.splitext(os.path.basename(file))[0].split('_'))
            for file in files
        ]

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect the wav files and labels belonging to the requested split.

        The archive is downloaded into ``DATA_HOME`` when the audio directory
        is missing. Files are shuffled with ``seed`` and cut into ``n_folds``
        folds; fold ``split`` is the dev set and the remaining folds form the
        train set.

        Returns:
            ``(files, labels)`` with ``labels`` being indices into ``label_list``.
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        if not os.path.isdir(audio_dir):
            download_and_decompress(self.archieves, DATA_HOME)

        # os.walk yields entries in arbitrary order, so sort first: otherwise
        # the same seed could produce different (overlapping) train and dev
        # sets on different runs or machines.
        wav_files = sorted(
            os.path.join(root, file)
            for root, _, files in os.walk(audio_dir) for file in files
            if file.endswith('.wav'))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        # At least one sample per fold, so tiny file sets do not divide by zero.
        n_samples_per_fold = max(len(meta_info) // n_folds, 1)
        for idx, sample in enumerate(meta_info):
            _, _, emotion = sample
            target = self.label_list.index(emotion)
            # Samples left over by the integer division join the last fold
            # instead of landing in a non-existent fold ``n_folds + 1``.
            fold = min(idx // n_samples_per_fold + 1, n_folds)

            wanted = (fold != split) if mode == 'train' else (fold == split)
            if wanted:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
@ -0,0 +1,104 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['UrbanSound8K']
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanSound8K(AudioClassificationDataset):
    """
    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
    drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
    classes are drawn from the urban sound taxonomy.

    Reference:
        A Dataset and Taxonomy for Urban Sound Research
        https://dl.acm.org/doi/10.1145/2647868.2655045
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
            'md5': '9aa69802bbf37fb986f71ec1483a196e',
        },
    ]
    label_list = [
        "air_conditioner", "car_horn", "children_playing", "dog_bark",
        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
        "street_music"
    ]
    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
    meta_info = collections.namedtuple(
        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
                      'class_id', 'label'))
    audio_path = os.path.join('UrbanSound8K', 'audio')

    def __init__(self,
                 mode: str = 'train',
                 split: int = 1,
                 feat_type: str = 'raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self):
        """Parse the metadata CSV (header skipped) into ``meta_info`` records."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            return [
                self.meta_info(*line.strip().split(','))
                for line in rf.readlines()[1:]
            ]

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Collect the audio paths and class ids of the requested split.

        The archive is downloaded into ``DATA_HOME`` when the audio directory
        or the metadata file is missing. Fold ``split`` is the dev set; every
        other fold belongs to the train set.

        Returns:
            ``(files, labels)`` in metadata order.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        files = []
        labels = []
        for sample in self._get_meta_info():
            filename, _, _, _, _, fold, target, _ = sample
            in_dev_fold = int(fold) == split
            # train keeps everything outside the dev fold; any other mode
            # keeps exactly the dev fold.
            wanted = (not in_dev_fold) if mode == 'train' else in_dev_fold
            if wanted:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

        return files, labels
|
@ -0,0 +1,356 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import csv
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from multiprocessing import cpu_count
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
from pathos.multiprocessing import Pool
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..backends.soundfile_backend import soundfile_load as load_audio
|
||||||
|
from ..utils import DATA_HOME
|
||||||
|
from ..utils import decompress
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from .dataset import feat_funcs
|
||||||
|
|
||||||
|
__all__ = ['VoxCeleb']
|
||||||
|
|
||||||
|
|
||||||
|
class VoxCeleb(Dataset):
    """VoxCeleb1 speaker dataset with ``train``/``dev``/``enroll``/``test`` subsets.

    On first use the audio archives and the verification trial list are
    downloaded, CSV manifests (columns ``id``, ``duration``, ``wav``,
    ``start``, ``stop``, ``spk_id``) are generated under ``csv_path`` and a
    speaker-to-label table is written to ``meta_path``. An extra directory of
    wav files can be mixed into train/dev through ``vox2_base_path``.
    """

    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
    archieves_audio_dev = [
        {
            'url': source_url + 'vox1_dev_wav_partaa',
            'md5': 'e395d020928bc15670b570a21695ed96',
        },
        {
            'url': source_url + 'vox1_dev_wav_partab',
            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
        },
        {
            'url': source_url + 'vox1_dev_wav_partac',
            'md5': '017d579a2a96a077f40042ec33e51512',
        },
        {
            'url': source_url + 'vox1_dev_wav_partad',
            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
        },
    ]
    archieves_audio_test = [
        {
            'url': source_url + 'vox1_test_wav.zip',
            'md5': '185fdc63c3c739954633d50379a3d102',
        },
    ]
    archieves_meta = [
        {
            'url':
            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
            'md5':
            'b73110731c9223c1461fe49cb48dddfc',
        },
    ]

    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
    sample_rate = 16000
    meta_info = collections.namedtuple(
        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
    base_path = os.path.join(DATA_HOME, 'vox1')
    wav_path = os.path.join(base_path, 'wav')
    meta_path = os.path.join(base_path, 'meta')
    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['train', 'dev', 'enroll', 'test']

    def __init__(
            self,
            subset: str = 'train',
            feat_type: str = 'raw',
            random_chunk: bool = True,
            chunk_duration: float = 3.0,  # seconds
            split_ratio: float = 0.9,  # train split ratio
            seed: int = 0,
            target_dir: str = None,
            vox2_base_path=None,
            **kwargs):
        """Prepare the VoxCeleb data and load the audio info of one subset.

        Args:
            subset (str, optional): dataset name, one of train, dev, enroll or test. Defaults to 'train'.
            feat_type (str, optional): feature type, a key of ``feat_funcs``. Defaults to 'raw'.
            random_chunk (bool, optional): pick a random chunk of ``chunk_duration`` seconds from
                each audio instead of the span recorded in the manifest. Defaults to True.
            chunk_duration (float, optional): chunk length in seconds. Defaults to 3.0.
            split_ratio (float, optional): fraction of the non-test audio files assigned to train;
                the rest goes to dev. Defaults to 0.9.
            seed (int, optional): accepted but currently unused (seeding is disabled below). Defaults to 0.
            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
            vox2_base_path (str, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
            **kwargs: extra keyword arguments forwarded to the feature function.

        Raises:
            AssertionError: if ``subset`` is not one of ``subsets``.
        """
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.spk_id2label = {}
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration
        self.split_ratio = split_ratio
        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
        self.vox2_base_path = vox2_base_path

        # if we set the target dir, we will change the vox data info data from base path to target dir
        # NOTE: these are assigned on the class, so they also affect every
        # instance created afterwards.
        VoxCeleb.csv_path = os.path.join(
            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
        VoxCeleb.meta_path = os.path.join(
            target_dir, "voxceleb",
            'meta') if target_dir else VoxCeleb.meta_path
        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
                                               'veri_test2.txt')
        # self._data = self._get_data()[:1000] # KP: Small dataset test.
        self._data = self._get_data()
        super(VoxCeleb, self).__init__()

        # Set up a seed to reproduce training or predicting result.
        # random.seed(seed)

    @staticmethod
    def _concatenate_parts(pattern: str, output_file: str) -> None:
        """Join every file matching ``pattern``, in sorted name order, into ``output_file``.

        Raises:
            FileNotFoundError: If nothing matches ``pattern``.
        """
        part_files = sorted(glob.glob(pattern))
        if not part_files:
            raise FileNotFoundError(f'No archive parts match {pattern}')
        with open(output_file, 'wb') as merged:
            for part_file in part_files:
                with open(part_file, 'rb') as part:
                    # Copy in 1 MiB blocks to keep memory use flat.
                    for block in iter(lambda: part.read(1 << 20), b''):
                        merged.write(block)

    def _get_data(self):
        """Download/prepare the corpus if needed and load the subset manifest.

        Also fills ``self.spk_id2label`` from ``spk_id2label.txt``.

        Returns:
            The list of ``meta_info`` records of ``self.subset``.
        """
        # Download audio files.
        # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
        # so, we check the vox1/wav dir status
        print(f"wav base path: {self.wav_path}")
        if not os.path.isdir(self.wav_path):
            print("start to download the voxceleb1 dataset")
            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
                self.archieves_audio_dev,
                self.base_path,
                decompress=False)
            download_and_decompress(  # download the vox1_test_wav.zip and unzip
                self.archieves_audio_test,
                self.base_path,
                decompress=True)

            # Concatenate the downloaded parts into one zip file. Done in
            # Python rather than through a shell so failures are raised and
            # paths with spaces work.
            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
            print(f'Concatenating all parts to: {dev_zipfile}')
            self._concatenate_parts(
                os.path.join(
                    glob.escape(self.base_path), "vox1_dev_wav_parta*"),
                dev_zipfile)

            # Extract all audio files of dev and test set.
            decompress(dev_zipfile, self.base_path)

        # Download meta files.
        if not os.path.isdir(self.meta_path):
            print("prepare the meta data")
            download_and_decompress(
                self.archieves_meta, self.meta_path, decompress=False)

        # Data preparation. The manifest file itself is checked (not just its
        # directory) so that an interrupted earlier run is repaired instead of
        # failing on a missing file.
        subset_csv = os.path.join(self.csv_path, f'{self.subset}.csv')
        if not os.path.isfile(subset_csv):
            os.makedirs(self.csv_path, exist_ok=True)
            self.prepare_data()

        data = []
        print(f"read the {self.subset} from {subset_csv}")
        # Read with the csv module so quoted fields written by generate_csv
        # round-trip correctly.
        with open(subset_csv, 'r', newline='') as rf:
            reader = csv.reader(rf)
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # tolerate blank lines
                    continue
                audio_id, duration, wav, start, stop, spk_id = row
                data.append(
                    self.meta_info(audio_id,
                                   float(duration), wav,
                                   int(start), int(stop), spk_id))

        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
            for line in f:
                spk_id, label = line.strip().split(' ')
                self.spk_id2label[spk_id] = int(label)

        return data

    def _convert_to_record(self, idx: int):
        """Load the audio span of item ``idx`` and attach features (and label).

        Returns:
            A dict holding every ``meta_info`` field plus ``'feat'`` and, for
            the train and dev subsets, ``'label'``.

        Raises:
            AssertionError: If ``self.feat_type`` is not a key of ``feat_funcs``.
        """
        sample = self._data[idx]

        # Copy every namedtuple field into a plain dict.
        record = {field: getattr(sample, field) for field in type(sample)._fields}

        waveform, sr = load_audio(record['wav'])

        # random select a chunk audio samples from the audio
        if self.random_chunk:
            num_wav_samples = waveform.shape[0]
            num_chunk_samples = int(self.chunk_duration * sr)
            # Clamp the upper bound: randint raises ValueError on an empty
            # range, which happened whenever the audio was not longer than
            # the chunk. In that case the chunk simply starts at 0.
            last_start = max(num_wav_samples - num_chunk_samples - 1, 0)
            start = random.randint(0, last_start)
            stop = start + num_chunk_samples
        else:
            start = record['start']
            stop = record['stop']

        waveform = waveform[start:stop]

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        if self.subset in ['train',
                           'dev']:  # Labels are available in train and dev.
            record.update({'label': self.spk_id2label[record['spk_id']]})

        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """Name the consecutive ``seg_dur``-long chunks that fit in a clip.

        Args:
            seg_dur: Chunk length, in the same unit as ``audio_duration``
                (callers in this class pass seconds).
            audio_id: Prefix of every chunk name.
            audio_duration: Total clip length.

        Returns:
            A list of ``"<audio_id>_<start>_<end>"`` strings, one per whole
            chunk; a trailing partial chunk is not included.
        """
        num_chunks = int(audio_duration / seg_dur)

        return [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """Build the manifest rows for one wav file.

        The speaker, session and utterance are taken from the last three
        components of ``wav_file``. With ``split_chunks`` every whole chunk of
        ``self.chunk_duration`` seconds gets a row (so a file shorter than one
        chunk yields no rows); otherwise one row spans the whole file.

        Returns:
            A list of ``[id, duration, wav, start, stop, spk_id]`` rows, with
            ``start``/``stop`` expressed in samples.
        """
        waveform, sr = load_audio(wav_file)
        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
        audio_duration = waveform.shape[0] / sr

        if not split_chunks:
            # Keep whole audio.
            return [[
                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
            ]]

        # Split into pieces of self.chunk_duration seconds.
        rows = []
        uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
                                            audio_duration)
        for chunk in uniq_chunks_list:
            s, e = chunk.split("_")[-2:]  # Timestamps of start and end
            start_sample = int(float(s) * sr)
            end_sample = int(float(e) * sr)
            # id, duration, wav, start, stop, spk_id
            rows.append([
                chunk, audio_duration, wav_file, start_sample, end_sample,
                spk_id
            ])
        return rows

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool = True):
        """Write the manifest describing ``wav_files`` to ``output_file``.

        The per-file work is spread over a process pool with one worker per
        CPU.

        Args:
            wav_files: Paths of the audio files to describe.
            output_file: Destination CSV path (overwritten).
            split_chunks: Forwarded to ``_get_audio_info``.
        """
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
        # NOTE (from the original authors): a C++ exception may be reported
        # here, but the program still runs to completion, so it can be ignored.
        with Pool(cpu_count()) as p:
            infos = list(
                tqdm(
                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
                           wav_files),
                    total=len(wav_files)))

        csv_lines = []
        for info in infos:
            csv_lines.extend(info)

        # newline="" is what the csv module requires; without it extra "\r"
        # characters are written on platforms that translate line endings.
        with open(output_file, mode="w", newline="") as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            csv_writer.writerows(csv_lines)

    def prepare_data(self):
        """Generate the four subset manifests and ``spk_id2label.txt``.

        Enroll and test files come from the verification trial list. Every
        other wav file found under ``wav_path`` (and ``vox2_base_path`` when
        usable) whose speaker does not appear in that list is shuffled and
        split into train and dev according to ``self.split_ratio``.
        """
        # Audio of speakers in veri_test_file should not be included in training set.
        print("start to prepare the data csv file")
        enroll_files = set()
        test_files = set()
        # get the enroll and test audio file path
        with open(self.veri_test_file, 'r') as f:
            for line in f:
                _, enrol_file, test_file = line.strip().split(' ')
                enroll_files.add(os.path.join(self.wav_path, enrol_file))
                test_files.add(os.path.join(self.wav_path, test_file))
        enroll_files = sorted(enroll_files)
        test_files = sorted(test_files)

        # get the enroll and test speakers
        # (the speaker id is the first path component after ".../wav/")
        test_spks = {
            file.split('/wav/')[1].split('/')[0]
            for file in (enroll_files + test_files)
        }

        # get all the train and dev audios file path
        audio_files = []
        speakers = set()
        print("Getting file list...")
        for path in [self.wav_path, self.vox2_base_path]:
            # if vox2 directory is not set and vox2 is not a directory
            # we will not process this directory
            if not path or not os.path.exists(path):
                print(f"{path} is an invalid path, please check again, "
                      "and we will ignore the vox2 base path")
                continue
            for file in glob.glob(
                    os.path.join(path, "**", "*.wav"), recursive=True):
                spk = file.split('/wav/')[1].split('/')[0]
                if spk in test_spks:
                    continue
                speakers.add(spk)
                audio_files.append(file)

        print(
            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
        )
        # encode the train and dev speakers label to spk_id2label.txt
        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
            for label, spk_id in enumerate(
                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
                f.write(f'{spk_id} {label}\n')

        audio_files = sorted(audio_files)
        random.shuffle(audio_files)
        split_idx = int(self.split_ratio * len(audio_files))
        # split_ratio to train
        train_files, dev_files = audio_files[:split_idx], audio_files[
            split_idx:]

        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))

        self.generate_csv(
            enroll_files,
            os.path.join(self.csv_path, 'enroll.csv'),
            split_chunks=False)
        self.generate_csv(
            test_files,
            os.path.join(self.csv_path, 'test.csv'),
            split_chunks=False)

    def __getitem__(self, idx):
        """Return the record (metadata, features and possibly label) of item ``idx``."""
        return self._convert_to_record(idx)

    def __len__(self):
        """Return the number of loaded manifest rows."""
        return len(self._data)
|
@ -0,0 +1,17 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .layers import LogMelSpectrogram
|
||||||
|
from .layers import MelSpectrogram
|
||||||
|
from .layers import MFCC
|
||||||
|
from .layers import Spectrogram
|
@ -0,0 +1,328 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from functools import partial
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.nn as nn
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
from ..functional import compute_fbank_matrix
|
||||||
|
from ..functional import create_dct
|
||||||
|
from ..functional import power_to_db
|
||||||
|
from ..functional.window import get_window
|
||||||
|
|
||||||
|
# Feature layers exported by a star-import of this module.
__all__ = ['Spectrogram', 'MelSpectrogram', 'LogMelSpectrogram', 'MFCC']
|
||||||
|
|
||||||
|
|
||||||
|
class Spectrogram(nn.Layer):
    """Layer that converts waveforms into spectrograms.

    The spectrogram is the magnitude of the short-time Fourier transform
    (STFT) of the input, raised to the exponent `power`.

    Args:
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT. It is forwarded unchanged to `paddle.signal.stft`, so `None` selects that function's own default. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, `n_fft` is used. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram; must be greater than 0. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Padding pattern used when `center` is `True`. Defaults to 'reflect'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str='float32') -> None:
        super().__init__()

        assert power > 0, 'Power of spectrogram must be > 0.'
        self.power = power

        # An unspecified window length falls back to the FFT size.
        win_length = n_fft if win_length is None else win_length

        self.fft_window = get_window(
            window, win_length, fftbins=True, dtype=dtype)

        # Freeze every STFT option so that `forward` only supplies the signal.
        stft_options = {
            'n_fft': n_fft,
            'hop_length': hop_length,
            'win_length': win_length,
            'window': self.fft_window,
            'center': center,
            'pad_mode': pad_mode,
        }
        self._stft = partial(paddle.signal.stft, **stft_options)
        self.register_buffer('fft_window', self.fft_window)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
        """
        magnitude = paddle.abs(self._stft(x))
        return paddle.pow(magnitude, self.power)
|
||||||
|
|
||||||
|
|
||||||
|
class MelSpectrogram(nn.Layer):
    """Layer that converts waveforms into mel spectrograms.

    A `Spectrogram` is computed first and then multiplied by a mel filter
    bank matrix built with `compute_fbank_matrix`.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT, forwarded to `Spectrogram`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT, forwarded to `Spectrogram`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Padding pattern used when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. If `None`, `sr // 2` is used for the filter bank. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str='float32') -> None:
        super().__init__()

        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)

        # Keep the mel configuration as given by the caller; `self.f_max`
        # therefore stays `None` when no upper bound was passed.
        self.n_mels, self.f_min, self.f_max = n_mels, f_min, f_max
        self.htk, self.norm = htk, norm

        # Upper band edge actually used to build the filter bank.
        upper_hz = sr // 2 if f_max is None else f_max
        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=upper_hz,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        spectrogram = self._spectrogram(x)
        return paddle.matmul(self.fbank_matrix, spectrogram)
|
||||||
|
|
||||||
|
|
||||||
|
class LogMelSpectrogram(nn.Layer):
    """Layer that converts waveforms into log-mel spectrograms.

    A `MelSpectrogram` is computed first and then converted to decibels with
    `power_to_db`.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT, forwarded to `MelSpectrogram`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT, forwarded to `MelSpectrogram`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Padding pattern used when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        super().__init__()

        # Everything related to the mel spectrogram is delegated unchanged.
        mel_options = {
            'sr': sr,
            'n_fft': n_fft,
            'hop_length': hop_length,
            'win_length': win_length,
            'window': window,
            'power': power,
            'center': center,
            'pad_mode': pad_mode,
            'n_mels': n_mels,
            'f_min': f_min,
            'f_max': f_max,
            'htk': htk,
            'norm': norm,
            'dtype': dtype,
        }
        self._melspectrogram = MelSpectrogram(**mel_options)

        # Parameters of the decibel conversion applied in `forward`.
        self.ref_value, self.amin, self.top_db = ref_value, amin, top_db

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
        """
        return power_to_db(
            self._melspectrogram(x),
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
|
||||||
|
|
||||||
|
|
||||||
|
class MFCC(nn.Layer):
    """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.

    A `LogMelSpectrogram` is computed first and then projected onto a DCT
    matrix built with `create_dct`.

    Args:
        sr (int, optional): Sample rate. Defaults to 22050.
        n_mfcc (int, optional): Number of cepstral coefficients to return; must not exceed `n_mels`. Defaults to 40.
        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
        hop_length (Optional[int], optional): The hop length of the short time FFT, forwarded to `LogMelSpectrogram`. Defaults to None.
        win_length (Optional[int], optional): The window length of the short time FFT, forwarded to `LogMelSpectrogram`. Defaults to None.
        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at sample `t * hop_length`. Defaults to True.
        pad_mode (str, optional): Padding pattern used when `center` is `True`. Defaults to 'reflect'.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
    """

    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str='float32') -> None:
        # NOTE: the default dtype is the string 'float32', matching the `str`
        # annotation and the defaults of the other feature layers.
        super(MFCC, self).__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)
        self._log_melspectrogram = LogMelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            ref_value=ref_value,
            amin=amin,
            top_db=top_db,
            dtype=dtype)
        # DCT matrix of shape `(n_mels, n_mfcc)` that maps mel bins to
        # cepstral coefficients.
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x (Tensor): Tensor of waveforms with shape `(N, T)`

        Returns:
            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
        """
        log_mel_feature = self._log_melspectrogram(x)
        # Move the mel axis last, project it onto the DCT basis, then move
        # the coefficient axis back: (N, n_mels, L) -> (N, n_mfcc, L).
        mfcc = paddle.matmul(
            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
                (0, 2, 1))
        return mfcc
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .functional import compute_fbank_matrix
|
||||||
|
from .functional import create_dct
|
||||||
|
from .functional import fft_frequencies
|
||||||
|
from .functional import hz_to_mel
|
||||||
|
from .functional import mel_frequencies
|
||||||
|
from .functional import mel_to_hz
|
||||||
|
from .functional import power_to_db
|
@ -0,0 +1,266 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from librosa(https://github.com/librosa/librosa)
|
||||||
|
import math
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
# Functions exported by a star-import of this module.
__all__ = [
    'hz_to_mel', 'mel_to_hz', 'mel_frequencies', 'fft_frequencies',
    'compute_fbank_matrix', 'power_to_db', 'create_dct'
]
|
||||||
|
|
||||||
|
|
||||||
|
def hz_to_mel(freq: Union[Tensor, float],
              htk: bool=False) -> Union[Tensor, float]:
    """Convert frequencies from Hz to the mel scale.

    With `htk=True` the HTK formula `2595 * log10(1 + f / 700)` is used.
    Otherwise the scale is linear below 1000 Hz and logarithmic above it.

    Args:
        freq (Union[Tensor, float]): Frequency in Hz, a scalar or a tensor with arbitrary shape.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[Tensor, float]: Frequency in mels.
    """
    is_tensor = isinstance(freq, Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below `min_log_hz`.
    linear = (freq - f_min) / f_sp

    if is_tensor:
        # The 1e-10 keeps the logarithm finite for zero frequencies, so the
        # masked-out entries cannot turn into nan.
        log_part = min_log_mel + paddle.log(freq / min_log_hz +
                                            1e-10) / logstep
        in_log_region = (freq > min_log_hz).astype(freq.dtype)
        # Blend both parts with a 0/1 mask (stand-in for a masked fill).
        return log_part * in_log_region + linear * (1 - in_log_region)

    if freq >= min_log_hz:
        return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
    return linear
|
||||||
|
|
||||||
|
|
||||||
|
def mel_to_hz(mel: Union[float, Tensor],
              htk: bool=False) -> Union[float, Tensor]:
    """Convert mel values to frequencies in Hz.

    With `htk=True` the HTK formula `700 * (10 ** (m / 2595) - 1)` is used.
    Otherwise the scale is linear up to the mel value of 1000 Hz and
    exponential above it.

    Args:
        mel (Union[float, Tensor]): The mel frequency, a scalar or a tensor with arbitrary shape.
        htk (bool, optional): Use htk scaling. Defaults to False.

    Returns:
        Union[float, Tensor]: Frequencies in Hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below `min_log_mel`.
    linear = f_min + f_sp * mel

    if isinstance(mel, Tensor):
        log_part = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
        in_log_region = (mel > min_log_mel).astype(mel.dtype)
        # Blend both parts with a 0/1 mask (stand-in for a masked fill).
        return log_part * in_log_region + linear * (1 - in_log_region)

    if mel >= min_log_mel:
        return min_log_hz * math.exp(logstep * (mel - min_log_mel))
    return linear
|
||||||
|
|
||||||
|
|
||||||
|
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str='float32') -> Tensor:
    """Compute frequencies that are evenly spaced on the mel scale.

    Args:
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
        htk (bool, optional): Use htk scaling. Defaults to False.
        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.

    Returns:
        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
    """
    # Map both limits to mels, space the points uniformly there, then map
    # the points back to Hz.
    mel_low = hz_to_mel(f_min, htk=htk)
    mel_high = hz_to_mel(f_max, htk=htk)
    mel_points = paddle.linspace(mel_low, mel_high, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
|
||||||
|
|
||||||
|
|
||||||
|
def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
    """Compute the frequencies of the FFT bins between 0 and `sr / 2`.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.

    Returns:
        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
    """
    highest_hz = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, highest_hz, n_bins, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str='float32') -> Tensor:
    """Build a mel filter bank matrix of triangular filters.

    Args:
        sr (int): Sample rate.
        n_fft (int): Number of fft bins.
        n_mels (int, optional): Number of mel bins. Defaults to 64.
        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
        f_max (Optional[float], optional): Maximum frequency in Hz. If `None`, `sr / 2` is used. Defaults to None.
        htk (bool, optional): Use htk scaling. Defaults to False.
        norm (Union[str, float], optional): Type of normalization: 'slaney', or a number `p` for p-norm normalization of each filter. Any other value leaves the filters unnormalized. Defaults to 'slaney'.
        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.

    Returns:
        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
    """
    if f_max is None:
        f_max = float(sr) / 2

    # Center freqs of each FFT bin
    bin_freqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # 'Center freqs' of mel bands, plus one extra edge on either side
    band_edges = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    # Width of every band and signed distance from each edge to each bin
    # (equivalent to np.diff and np.subtract.outer).
    band_widths = band_edges[1:] - band_edges[:-1]
    distances = band_edges.unsqueeze(1) - bin_freqs.unsqueeze(0)

    filters = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
    for band in range(n_mels):
        # rising and falling slopes for all bins
        rising = -distances[band] / band_widths[band]
        falling = distances[band + 2] / band_widths[band + 1]

        # .. then intersect them with each other and zero
        triangle = paddle.minimum(rising, falling)
        filters[band] = paddle.maximum(paddle.zeros_like(rising), triangle)

    if norm == 'slaney':
        # Slaney-style mel is scaled to be approx constant energy per channel
        scale = 2.0 / (band_edges[2:n_mels + 2] - band_edges[:n_mels])
        filters *= scale.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        filters = paddle.nn.functional.normalize(filters, p=norm, axis=-1)

    return filters
|
||||||
|
|
||||||
|
|
||||||
|
def power_to_db(spect: Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The scaling `10 * log10(x / ref)` is computed with `x` clamped from below
    by `amin`, so the logarithm stays finite.

    Args:
        spect (Tensor): STFT power spectrogram.
        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
        amin (float, optional): Minimum threshold. Defaults to 1e-10.
        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.

    Returns:
        Tensor: Power spectrogram in db scale.

    Raises:
        Exception: If `amin` or `ref_value` is not strictly positive, or if `top_db` is negative.
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")
    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")

    unit = paddle.ones_like(spect)
    floored = paddle.maximum(unit * amin, spect)
    db = 10.0 * paddle.log10(floored)
    db -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is None:
        return db

    if top_db < 0:
        raise Exception("top_db must be non-negative")
    # Clip everything that lies more than `top_db` below the global peak.
    return paddle.maximum(db, unit * (db.max() - top_db))
|
||||||
|
|
||||||
|
|
||||||
|
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: str='float32') -> Tensor:
    """Create a discrete cosine transform(DCT) matrix.

    Args:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (Optional[str], optional): Normalization type, either `None` or 'ortho'. Defaults to 'ortho'.
        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.

    Returns:
        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
    """
    mel_index = paddle.arange(n_mels, dtype=dtype)
    coeff_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    # Broadcasting a row of mel indices against a column of coefficient
    # indices gives a matrix of size (n_mfcc, n_mels).
    basis = paddle.cos(math.pi / float(n_mels) * (mel_index + 0.5) *
                       coeff_index)

    if norm is not None:
        assert norm == "ortho"
        # Orthonormal scaling: the first row gets an extra 1/sqrt(2).
        basis[0] *= 1.0 / math.sqrt(2.0)
        basis *= math.sqrt(2.0 / float(n_mels))
    else:
        basis *= 2.0

    return basis.T
|
@ -0,0 +1,337 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
# limitations under the License.
|
||||||
|
import math
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
# The only name exported by a star-import of this module.
__all__ = ['get_window']
|
||||||
|
|
||||||
|
|
||||||
|
def _cat(x: List[Tensor], data_type: str) -> Tensor:
    """Convert every item of `x` to a tensor of `data_type` and concatenate them."""
    pieces = []
    for item in x:
        pieces.append(paddle.to_tensor(item, data_type))
    return paddle.concat(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def _acosh(x: Union[Tensor, float]) -> Tensor:
|
||||||
|
if isinstance(x, float):
|
||||||
|
return math.log(x + math.sqrt(x**2 - 1))
|
||||||
|
return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
|
||||||
|
|
||||||
|
|
||||||
|
def _extend(M: int, sym: bool) -> bool:
|
||||||
|
"""Extend window by 1 sample if needed for DFT-even symmetry. """
|
||||||
|
if not sym:
|
||||||
|
return M + 1, True
|
||||||
|
else:
|
||||||
|
return M, False
|
||||||
|
|
||||||
|
|
||||||
|
def _len_guards(M: int) -> bool:
|
||||||
|
"""Handle small or incorrect window lengths. """
|
||||||
|
if int(M) != M or M < 0:
|
||||||
|
raise ValueError('Window length M must be a non-negative integer')
|
||||||
|
|
||||||
|
return M <= 1
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate(w: Tensor, needed: bool) -> Tensor:
|
||||||
|
"""Truncate window by 1 sample if needed for DFT-even symmetry. """
|
||||||
|
if needed:
|
||||||
|
return w[:-1]
|
||||||
|
else:
|
||||||
|
return w
|
||||||
|
|
||||||
|
|
||||||
|
def _general_gaussian(M: int, p, sig, sym: bool=True,
                      dtype: str='float64') -> Tensor:
    """Compute a window with a generalized Gaussian shape.

    The window is `exp(-0.5 * |n / sig| ** (2 * p))` with `n` centered on
    the middle of the window.
    This function is consistent with scipy.signal.windows.general_gaussian().
    """
    if _len_guards(M):
        # Windows of length 0 or 1 are all ones.
        return paddle.ones((M, ), dtype=dtype)

    length, needs_trunc = _extend(M, sym)

    # Sample positions centered on zero.
    offsets = paddle.arange(0, length, dtype=dtype) - (length - 1.0) / 2.0
    window = paddle.exp(-0.5 * paddle.abs(offsets / sig)**(2 * p))

    return _truncate(window, needs_trunc)
|
||||||
|
|
||||||
|
|
||||||
|
def _general_cosine(M: int, a: float, sym: bool=True,
                    dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.

    This function is consistent with scipy.signal.windows.general_cosine().

    Args:
        M (int): Number of samples in the window.
        a: Weighting coefficients; despite the ``float`` annotation the code
            indexes it and takes its length, so a sequence is expected.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    length, drop_last = _extend(M, sym)
    phase = paddle.linspace(-math.pi, math.pi, length, dtype=dtype)
    window = paddle.zeros((length, ), dtype=dtype)
    # Term k contributes a[k] * cos(k * phase).
    for order, coeff in enumerate(a):
        window += coeff * paddle.cos(order * phase)
    return _truncate(window, drop_last)
|
||||||
|
|
||||||
|
|
||||||
|
def _general_hamming(M: int, alpha: float, sym: bool=True,
                     dtype: str='float64') -> Tensor:
    """Compute a generalized Hamming window.

    This function is consistent with scipy.signal.windows.general_hamming()

    Args:
        M (int): Number of samples in the window.
        alpha (float): Window coefficient; the cosine weights used are
            ``[alpha, 1 - alpha]``.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    weights = [alpha, 1. - alpha]
    return _general_cosine(M, weights, sym, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _taylor(M: int,
            nbar=4,
            sll=30,
            norm=True,
            sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Taylor window.
    The Taylor window taper function approximates the Dolph-Chebyshev window's
    constant sidelobe level for a parameterized number of near-in sidelobes.

    Args:
        M (int): Number of samples in the window.
        nbar (int, optional): Controls how many cosine terms are summed
            (``nbar - 1`` terms). Defaults to 4.
        sll (float, optional): Sidelobe level parameter, taken as a positive
            number (see the comment in the body). Defaults to 30.
        norm (bool, optional): If True, divide the window by its value at
            ``(M - 1) / 2``. Defaults to True.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    M, needs_trunc = _extend(M, sym)
    # Original text uses a negative sidelobe level parameter and then negates
    # it in the calculation of B. To keep consistent with other methods we
    # assume the sidelobe level parameter to be positive.
    B = 10**(sll / 20)
    A = _acosh(B) / math.pi
    s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
    # Term indices 1 .. nbar - 1.
    ma = paddle.arange(1, nbar, dtype=dtype)

    Fm = paddle.empty((nbar - 1, ), dtype=dtype)
    # Alternating signs: +1 at even positions, -1 at odd positions.
    signs = paddle.empty_like(ma)
    signs[::2] = 1
    signs[1::2] = -1
    m2 = ma * ma
    for mi in range(len(ma)):
        numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
                                                           ))
        # The denominator is a product over every index except mi itself
        # (including mi would contribute a factor 1 - 1 = 0), hence the
        # separate handling of the first, last and interior positions.
        if mi == 0:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
        elif mi == len(ma) - 1:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
        else:
            denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
                mi] / m2[mi + 1:])

        Fm[mi] = numer / denom

    def W(n):
        # 1 + 2 * sum_m Fm[m] * cos(2 * pi * m * (n - M / 2 + 0.5) / M),
        # expressed as a matrix product of Fm with the cosine table.
        return 1 + 2 * paddle.matmul(
            Fm.unsqueeze(0),
            paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))

    w = W(paddle.arange(0, M, dtype=dtype))

    # normalize (Note that this is not described in the original text [1])
    if norm:
        scale = 1.0 / W((M - 1) / 2)
        w *= scale
    # W() returns a result with an extra leading axis from the matmul.
    w = w.squeeze()
    return _truncate(w, needs_trunc)
|
||||||
|
|
||||||
|
|
||||||
|
def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hamming window.

    The Hamming window is a raised-cosine taper with non-zero endpoints,
    optimized to minimize the nearest side lobe. It is the generalized
    Hamming window with coefficient 0.54.

    Args:
        M (int): Number of samples in the window.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    hamming_alpha = 0.54
    return _general_hamming(M, hamming_alpha, sym, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Hann window.

    The Hann window is a raised-cosine (sine-squared) taper whose ends touch
    zero. It is the generalized Hamming window with coefficient 0.5.

    Args:
        M (int): Number of samples in the window.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    hann_alpha = 0.5
    return _general_hamming(M, hann_alpha, sym, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Tukey window.

    The Tukey window is also known as a tapered cosine window.

    Args:
        M (int): Number of samples in the window.
        alpha (float, optional): Fraction of the window inside the cosine
            tapered region. Values ``<= 0`` give an all-ones window and
            values ``>= 1`` give a Hann window. Defaults to 0.5.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    if alpha <= 0:
        return paddle.ones((M, ), dtype=dtype)
    elif alpha >= 1.0:
        # The helper in this module is `_hann`; the previous call to the
        # undefined name `hann` raised NameError and ignored `dtype`.
        return _hann(M, sym=sym, dtype=dtype)

    M, needs_trunc = _extend(M, sym)

    n = paddle.arange(0, M, dtype=dtype)
    # Number of samples in each tapered edge (excluding the first sample).
    width = int(alpha * (M - 1) / 2.0)
    n1 = n[0:width + 1]
    n2 = n[width + 1:M - width - 1]
    n3 = n[M - width - 1:]

    # Rising cosine edge, flat top, falling cosine edge.
    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
    w2 = paddle.ones(n2.shape, dtype=dtype)
    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
                                          (M - 1))))
    w = paddle.concat([w1, w2, w3])

    return _truncate(w, needs_trunc)
|
||||||
|
|
||||||
|
|
||||||
|
def _kaiser(M: int, beta: float, sym: bool=True,
            dtype: str='float64') -> Tensor:
    """Compute a Kaiser window (not implemented).

    The Kaiser window is a taper formed by using a Bessel function.

    Raises:
        NotImplementedError: Always; this window is not available yet.
    """
    raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
def _gaussian(M: int, std: float, sym: bool=True,
              dtype: str='float64') -> Tensor:
    """Compute a Gaussian window.

    The Gaussian window has a Gaussian shape defined by the standard
    deviation (std).

    Args:
        M (int): Number of samples in the window.
        std (float): Standard deviation of the Gaussian.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    length, drop_last = _extend(M, sym)

    # Sample positions measured from the centre of the (extended) window.
    positions = paddle.arange(0, length, dtype=dtype) - (length - 1.0) / 2.0
    two_variance = 2 * std * std
    return _truncate(paddle.exp(-positions**2 / two_variance), drop_last)
|
||||||
|
|
||||||
|
|
||||||
|
def _exponential(M: int,
                 center=None,
                 tau=1.,
                 sym: bool=True,
                 dtype: str='float64') -> Tensor:
    """Compute an exponential (or Poisson) window.

    Args:
        M (int): Number of samples in the window.
        center (float, optional): Position of the peak. When None, the
            middle of the window, ``(M - 1) / 2``, is used. Must be None
            when ``sym`` is True.
        tau (float, optional): Decay constant; the window is
            ``exp(-|n - center| / tau)``. Defaults to 1.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.

    Raises:
        ValueError: If ``sym`` is True and ``center`` is given.
    """
    if center is not None and sym:
        raise ValueError("If sym==True, center must be None.")
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    length, drop_last = _extend(M, sym)

    peak = (length - 1) / 2 if center is None else center
    distance = paddle.abs(paddle.arange(0, length, dtype=dtype) - peak)
    return _truncate(paddle.exp(-distance / tau), drop_last)
|
||||||
|
|
||||||
|
|
||||||
|
def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a triangular window.

    Args:
        M (int): Number of samples in the window.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    length, drop_last = _extend(M, sym)

    ramp = paddle.arange(1, (length + 1) // 2 + 1, dtype=dtype)
    if length % 2 == 0:
        rising = (2 * ramp - 1.0) / length
        # Even length: mirror the whole rising half.
        falling = rising[::-1]
    else:
        rising = 2 * ramp / (length + 1.0)
        # Odd length: the peak sample is not repeated in the mirror.
        falling = rising[-2::-1]

    return _truncate(paddle.concat([rising, falling]), drop_last)
|
||||||
|
|
||||||
|
|
||||||
|
def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Bohman window.

    The Bohman window is the autocorrelation of a cosine window.

    Args:
        M (int): Number of samples in the window.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    length, drop_last = _extend(M, sym)

    # Interior points only; both end samples are set to zero explicitly below.
    grid = paddle.linspace(-1, 1, length, dtype=dtype)
    dist = paddle.abs(grid[1:-1])
    interior = (1 - dist) * paddle.cos(math.pi * dist) + 1.0 / math.pi * paddle.sin(
        math.pi * dist)
    window = _cat([0, interior, 0], dtype)

    return _truncate(window, drop_last)
|
||||||
|
|
||||||
|
|
||||||
|
def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a Blackman window.

    The Blackman window is a taper formed by the first three terms of a
    summation of cosines. It was designed to have close to the minimal
    leakage possible and is only slightly worse than a Kaiser window.

    Args:
        M (int): Number of samples in the window.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples.
    """
    blackman_weights = [0.42, 0.50, 0.08]
    return _general_cosine(M, blackman_weights, sym, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
    """Compute a window with a simple cosine shape.

    Args:
        M (int): Number of samples in the window.
        sym (bool, optional): True for a symmetric window, False for a
            periodic one. Defaults to True.
        dtype (str, optional): Data type of the window. Defaults to 'float64'.

    Returns:
        Tensor: The window samples, ``sin(pi / M * (n + 0.5))``.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)
    length, drop_last = _extend(M, sym)
    half_shifted = paddle.arange(0, length, dtype=dtype) + .5
    window = paddle.sin(math.pi / length * half_shifted)

    return _truncate(window, drop_last)
|
||||||
|
|
||||||
|
|
||||||
|
def get_window(window: Union[str, Tuple[str, float]],
               win_length: int,
               fftbins: bool=True,
               dtype: str='float64') -> Tensor:
    """Return a window of a given length and type.

    Args:
        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Windows that need parameters are given as a tuple whose first item is the name and whose remaining items are passed on positionally.
        win_length (int): Number of samples.
        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
        dtype (str, optional): The data type of the return window. Defaults to 'float64'.

    Returns:
        Tensor: The window represented as a tensor.

    Raises:
        ValueError: If ``window`` is neither a str nor a tuple, names a
            window that needs parameters without giving them, or names an
            unknown window.
    """
    sym = not fftbins

    args = ()
    if isinstance(window, tuple):
        winstr = window[0]
        if len(window) > 1:
            args = window[1:]
    elif isinstance(window, str):
        if window in ['gaussian', 'exponential']:
            raise ValueError("The '" + window + "' window needs one or "
                             "more parameters -- pass a tuple.")
        else:
            winstr = window
    else:
        raise ValueError("%s as window type is not supported." %
                         str(type(window)))

    # Resolve the implementation through an explicit allow-list instead of
    # eval(): eval() executed caller-supplied text and raised NameError (not
    # the KeyError that was being caught) for unknown names.
    known_windows = (
        'hamming',
        'hann',
        'kaiser',
        'gaussian',
        'exponential',
        'triang',
        'bohman',
        'blackman',
        'cosine',
        'tukey',
        'taylor',
        'general_gaussian',
        'general_cosine',
        'general_hamming', )
    if winstr not in known_windows:
        raise ValueError("Unknown window type.")
    winfunc = globals()['_' + winstr]

    params = (win_length, ) + args
    kwargs = {'sym': sym}
    return winfunc(*params, dtype=dtype, **kwargs)
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .eer import compute_eer
|
||||||
|
from .eer import compute_minDCF
|
@ -0,0 +1,100 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from sklearn.metrics import roc_curve
|
||||||
|
|
||||||
|
|
||||||
|
def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
    """Compute the equal error rate (EER) and the matching score threshold.

    Args:
        labels (np.ndarray): the trial labels, shape: [N], one-dimensional, N is the number of samples
        scores (np.ndarray): the trial scores, shape: [N], one-dimensional, N is the number of samples

    Returns:
        List[float]: eer and the specific threshold (returned as a pair)
    """
    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
    # The EER operating point is where the miss rate (1 - tpr) and the false
    # positive rate are closest to each other.
    gap = np.absolute((1 - tpr) - fpr)
    best = np.nanargmin(gap)
    return fpr[best], threshold[best]
|
||||||
|
|
||||||
|
|
||||||
|
def compute_minDCF(positive_scores,
                   negative_scores,
                   c_miss=1.0,
                   c_fa=1.0,
                   p_target=0.01):
    """
    This is modified from SpeechBrain
    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
    Computes the minDCF metric normally used to evaluate speaker verification
    systems. The min_DCF is the minimum of the following C_det function computed
    within the defined threshold range:

    C_det =  c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target)

    where p_miss is the missing probability and p_fa is the probability of having
    a false alarm.

    Args:
        positive_scores (Paddle.Tensor): The scores from entries of the same class.
        negative_scores (Paddle.Tensor): The scores from entries of different classes.
        c_miss (float, optional): Cost assigned to a missing error (default 1.0).
        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
        p_target (float, optional): Prior probability of having a target (default 0.01).

    Returns:
        List[float]: min dcf and the specific threshold (returned as a pair of floats)
    """
    # Flatten inputs that carry extra axes.
    if len(positive_scores.shape) > 1:
        positive_scores = positive_scores.squeeze()

    if len(negative_scores.shape) > 1:
        negative_scores = negative_scores.squeeze()

    # Candidate thresholds: every distinct score plus the midpoint between
    # each pair of neighbouring distinct scores.
    pooled = paddle.concat([positive_scores, negative_scores])
    distinct = paddle.unique(paddle.sort(pooled))
    midpoints = (distinct[0:-1] + distinct[1:]) / 2
    thresholds = paddle.sort(paddle.concat([distinct, midpoints]))
    num_thresholds = len(thresholds)

    # False rejection rate (miss detection): positives at or below threshold.
    pos_grid = paddle.concat(num_thresholds * [positive_scores.unsqueeze(0)])
    missed = pos_grid.transpose(perm=[1, 0]) <= thresholds
    p_miss = (missed.sum(0)).astype("float32") / pos_grid.shape[1]
    del pos_grid
    del missed

    # False acceptance rate (false alarm): negatives above threshold.
    neg_grid = paddle.concat(num_thresholds * [negative_scores.unsqueeze(0)])
    accepted = neg_grid.transpose(perm=[1, 0]) > thresholds
    p_fa = (accepted.sum(0)).astype("float32") / neg_grid.shape[1]
    del neg_grid
    del accepted

    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
    c_min = paddle.min(c_det, axis=0)
    min_index = paddle.argmin(c_det, axis=0)
    return float(c_min), float(thresholds[min_index])
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,27 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .download import decompress
|
||||||
|
from .download import download_and_decompress
|
||||||
|
from .download import load_state_dict_from_url
|
||||||
|
from .env import DATA_HOME
|
||||||
|
from .env import MODEL_HOME
|
||||||
|
from .env import PPAUDIO_HOME
|
||||||
|
from .env import USER_HOME
|
||||||
|
from .error import ParameterError
|
||||||
|
from .log import Logger
|
||||||
|
from .log import logger
|
||||||
|
from .time import seconds_to_hms
|
||||||
|
from .time import Timer
|
||||||
|
from .numeric import depth_convert
|
||||||
|
from .numeric import pcm16to32
|
@ -0,0 +1,64 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from paddle.framework import load as load_state_dict
|
||||||
|
from paddle.utils import download
|
||||||
|
|
||||||
|
from .log import logger
|
||||||
|
|
||||||
|
download.logger = logger
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'decompress',
|
||||||
|
'download_and_decompress',
|
||||||
|
'load_state_dict_from_url',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def decompress(file: str):
    """
    Extract all files from a compressed file.

    Args:
        file (str): Path of the archive; it must be an existing file.
    """
    missing_message = "File: {} not exists.".format(file)
    assert os.path.isfile(file), missing_message
    download._decompress(file)
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_decompress(archives: List[Dict[str, str]],
                            path: str,
                            decompress: bool=True):
    """
    Download archives and decompress to specific path.

    Args:
        archives (List[Dict[str, str]]): One dict per archive; each must
            contain the keys "url" and "md5".
        path (str): Target directory; created when it does not exist.
        decompress (bool, optional): Forwarded to
            ``download.get_path_from_url``. Defaults to True.
    """
    os.makedirs(path, exist_ok=True)

    for archive in archives:
        # The message is an f-string so the offending keys are actually shown
        # (it used to be a plain string referring to a misspelled variable).
        assert 'url' in archive and 'md5' in archive, \
            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
        download.get_path_from_url(
            archive['url'], path, archive['md5'], decompress=decompress)
|
||||||
|
|
||||||
|
|
||||||
|
def load_state_dict_from_url(url: str, path: str, md5: str=None):
    """
    Download a state dict from ``url`` into ``path`` and load it.

    Args:
        url (str): Location of the file to download.
        path (str): Directory to download into; created when missing.
        md5 (str, optional): Checksum forwarded to the downloader.

    Returns:
        The object returned by ``paddle.framework.load`` for the file
        ``<path>/<basename of url>``.
    """
    if not os.path.isdir(path):
        os.makedirs(path)

    local_file = os.path.join(path, os.path.basename(url))
    download.get_path_from_url(url, path, md5)
    return load_state_dict(local_file)
|
@ -0,0 +1,60 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
'''
|
||||||
|
This module is used to store environmental variables in PaddleAudio.
|
||||||
|
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
|
||||||
|
├ default value through the PPAUDIO_HOME environment variable.
|
||||||
|
├─ MODEL_HOME --> Store model files.
|
||||||
|
└─ DATA_HOME --> Store automatically downloaded datasets.
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'USER_HOME',
|
||||||
|
'PPAUDIO_HOME',
|
||||||
|
'MODEL_HOME',
|
||||||
|
'DATA_HOME',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_user_home():
    """Return the current user's home directory (expansion of ``~``)."""
    user_home = os.path.expanduser('~')
    return user_home
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ppaudio_home():
    """Return the PaddleAudio root directory.

    The PPAUDIO_HOME environment variable wins when it is set; otherwise
    ``~/.paddleaudio`` is used. A PPAUDIO_HOME value that does not exist yet
    is returned as is.

    Raises:
        RuntimeError: If PPAUDIO_HOME points to something that exists but is
            not a directory.
    """
    home_path = os.environ.get('PPAUDIO_HOME')
    if home_path is None:
        return os.path.join(_get_user_home(), '.paddleaudio')

    if os.path.exists(home_path) and not os.path.isdir(home_path):
        raise RuntimeError(
            'The environment variable PPAUDIO_HOME {} is not a directory.'.
            format(home_path))
    return home_path
|
||||||
|
|
||||||
|
|
||||||
|
def _get_sub_home(directory):
    """Return ``<PaddleAudio home>/<directory>``, creating it when missing.

    Args:
        directory (str): Name of the sub-directory.

    Returns:
        str: Path of the sub-directory.
    """
    sub_home = os.path.join(_get_ppaudio_home(), directory)
    if os.path.exists(sub_home):
        return sub_home
    os.makedirs(sub_home)
    return sub_home
|
||||||
|
|
||||||
|
|
||||||
|
# Directory constants, resolved once when this module is imported.
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
# NOTE: _get_sub_home() creates the directory if it does not exist, so
# importing this module may create these two folders on disk.
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
__all__ = ['ParameterError']
|
||||||
|
|
||||||
|
|
||||||
|
class ParameterError(Exception):
    """Exception raised by parameter checks."""
|
@ -0,0 +1,139 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import contextlib
|
||||||
|
import functools
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
import colorlog
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'Logger',
|
||||||
|
'logger',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Level name -> numeric logging level and console colour. TRAIN and EVAL are
# custom levels that sit between INFO (20) and WARNING (30).
log_config = {
    level_name: {
        'level': level_value,
        'color': level_color
    }
    for level_name, level_value, level_color in (
        ('DEBUG', 10, 'purple'),
        ('INFO', 20, 'green'),
        ('TRAIN', 21, 'cyan'),
        ('EVAL', 22, 'blue'),
        ('WARNING', 30, 'yellow'),
        ('ERROR', 40, 'red'),
        ('CRITICAL', 50, 'bold_red'), )
}
|
||||||
|
|
||||||
|
|
||||||
|
class Logger(object):
    '''
    Default logger in PaddleAudio

    One callable attribute is created per entry of ``log_config``, in both
    upper and lower case (e.g. ``logger.info(msg)`` / ``logger.INFO(msg)``).

    Args:
        name(str) : Logger name, default is 'PaddleAudio'
    '''

    def __init__(self, name: str=None):
        name = 'PaddleAudio' if not name else name
        self.logger = logging.getLogger(name)

        # Register every configured level and expose it as a method-like
        # attribute bound to that level.
        for key, conf in log_config.items():
            logging.addLevelName(conf['level'], key)
            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
            self.__dict__[key.lower()] = functools.partial(self.__call__,
                                                           conf['level'])

        self.format = colorlog.ColoredFormatter(
            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
            log_colors={key: conf['color']
                        for key, conf in log_config.items()})

        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)

        self.logger.addHandler(self.handler)
        self.logLevel = 'DEBUG'
        self.logger.setLevel(logging.DEBUG)
        # Do not forward records to ancestor loggers.
        self.logger.propagate = False
        self._is_enable = True

    def disable(self):
        """Silence this logger until `enable` is called."""
        self._is_enable = False

    def enable(self):
        """Turn output back on."""
        self._is_enable = True

    @property
    def is_enable(self) -> bool:
        """Whether messages are currently emitted."""
        return self._is_enable

    def __call__(self, log_level: int, msg: str):
        """Emit ``msg`` at the numeric level ``log_level`` unless disabled."""
        if not self.is_enable:
            return

        self.logger.log(log_level, msg)

    @contextlib.contextmanager
    def use_terminator(self, terminator: str):
        """Temporarily replace the stream handler's line terminator.

        The previous terminator is restored when the ``with`` block exits,
        including when it exits through an exception.
        """
        old_terminator = self.handler.terminator
        self.handler.terminator = terminator
        try:
            yield
        finally:
            self.handler.terminator = old_terminator

    @contextlib.contextmanager
    def processing(self, msg: str, interval: float=0.1):
        '''
        Continuously print a progress bar with rotating special effects.

        The printing thread is stopped and joined when the ``with`` block
        exits, including when it exits through an exception.

        Args:
            msg(str): Message to be printed.
            interval(float): Rotation interval. Default to 0.1.
        '''
        stop = threading.Event()

        def _printer():
            index = 0
            flags = ['\\', '|', '/', '-']
            while not stop.is_set():
                flag = flags[index % len(flags)]
                with self.use_terminator('\r'):
                    self.info('{}: {}'.format(msg, flag))
                # Sleeps for `interval`, but wakes up at once when stopped.
                stop.wait(interval)
                index += 1

        t = threading.Thread(target=_printer)
        t.start()
        try:
            yield
        finally:
            # Without this the spinner thread kept running forever whenever
            # the body raised.
            stop.set()
            t.join()
|
||||||
|
|
||||||
|
|
||||||
|
# Shared module-level logger instance, created with the default name
# 'PaddleAudio'.
logger = Logger()
|
@ -0,0 +1,107 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
__all__ = ["pcm16to32", "depth_convert"]
|
||||||
|
|
||||||
|
|
||||||
|
def pcm16to32(audio: np.ndarray) -> np.ndarray:
    """Convert int16 PCM samples to float32 by dividing by 2**15.

    Args:
        audio (np.ndarray): Waveform with dtype of int16. Arrays of any other
            dtype are returned unchanged.

    Returns:
        np.ndarray: Waveform with dtype of float32 (for int16 input).
    """
    if audio.dtype != np.int16:
        return audio

    # int16 has 16 bits, so the divisor is 2**15 = 32768.
    full_scale = 2**(np.iinfo(np.int16).bits - 1)
    return audio.astype("float32") / full_scale
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Data type casting in a safe way, i.e., prevent overflow or underflow.

    The values are clipped to the representable range of the *target* dtype
    before the cast.

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Target data type of the waveform.

    Returns:
        np.ndarray: `y` after safe casting.
    """
    target = np.dtype(dtype)
    # The limits must come from the target dtype. Choosing finfo/iinfo from
    # the source dtype (as before) raised ValueError for float -> int and
    # int -> float casts.
    if np.issubdtype(target, np.floating):
        limits = np.finfo(target)
    else:
        limits = np.iinfo(target)
    return np.clip(y, limits.min, limits.max).astype(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
    """Convert an audio array to a target dtype safely.

    This function converts an audio waveform to a target dtype, with the
    additional steps of preventing overflow/underflow and preserving the
    audio range:

    * float -> float: plain clipped cast.
    * float -> int: samples are multiplied by the target's maximum value,
      clipped to the target range, then cast.
    * int -> float: samples are cast, then divided by the source's maximum
      value.
    * int8 <-> int16: samples are rescaled by the ratio of the two maximum
      values, clipped to the target range, then cast (truncating).

    Args:
        y (np.ndarray): Input waveform array in 1D or 2D.
        dtype (Union[type, str]): Target data type of the waveform. Anything
            `np.dtype` accepts that resolves to int8, int16, float32 or
            float64.

    Returns:
        np.ndarray: `y` converted to `dtype`. When `y` already has that dtype
            the same array object is returned.

    Raises:
        ValueError: If `y.dtype` or `dtype` is not one of the supported
            dtypes.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']

    if y.dtype.name not in SUPPORT_DTYPE:
        raise ValueError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    # Normalise the target so that both 'int16' and np.int16 are accepted.
    # None is rejected explicitly because np.dtype(None) silently means float64.
    target = None
    if dtype is not None:
        try:
            target = np.dtype(dtype)
        except TypeError:
            target = None
    if target is None or target.name not in SUPPORT_DTYPE:
        raise ValueError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if target == y.dtype:
        return y

    source_is_float = y.dtype.kind == 'f'

    if target.kind == 'f':
        if source_is_float:
            # float32 <-> float64
            return _safe_cast(y, target)
        # int -> float: normalise by the source's positive full scale.
        return y.astype(target) / np.iinfo(y.dtype).max

    limits = np.iinfo(target)
    if source_is_float:
        scaled = y * limits.max
    elif target.name == 'int16':
        # int8 -> int16. Multiplying before dividing keeps every intermediate
        # exactly representable in float32, so 127 maps exactly to 32767.
        scaled = y.astype('float32') * limits.max / np.iinfo('int8').max
    else:
        # int16 -> int8. Widen to int32 first so the product cannot overflow.
        scaled = y.astype('int32') * limits.max / np.iinfo('int16').max

    # Clip before casting: e.g. int8 -128 scales to -33025, below int16's min.
    return np.clip(scaled, limits.min, limits.max).astype(target)
|
@ -0,0 +1,72 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Public names exported by `from <this module> import *`.
__all__ = [
    'Timer',
    'seconds_to_hms',
]
|
||||||
|
|
||||||
|
|
||||||
|
class Timer(object):
    '''Calculate running speed and estimated time of arrival (ETA).

    Typical use: create with the total number of steps, call `start()`,
    call `count()` once per finished step, and read `timing` / `eta` for
    progress reporting. Call `stop()` when done.
    '''

    def __init__(self, total_step: int):
        self.total_step = total_step
        # Step count at the moment `timing` was last read.
        self.last_start_step = 0
        self.current_step = 0
        self._is_running = True
        # Initialise the clocks here as well, so `timing` and `eta` do not
        # raise AttributeError when read before `start()` is called.
        now = time.time()
        self.last_time = now
        self.start_time = now

    def start(self):
        '''Reset both the speed clock and the overall start time to now.'''
        now = time.time()
        self.last_time = now
        self.start_time = now

    def stop(self):
        '''Mark the timer as stopped and record the end time.'''
        self._is_running = False
        self.end_time = time.time()

    def count(self) -> int:
        '''Advance by one step (never beyond `total_step`) and return the
        current step.'''
        if self.current_step < self.total_step:
            self.current_step += 1
        return self.current_step

    @property
    def timing(self) -> float:
        '''Steps per second since this property was last read.

        Reading it resets the measuring window. When no time has elapsed
        since the last read, 0.0 is returned and the window is left open,
        so the steps are counted on the next read instead of being lost.
        '''
        now = time.time()
        time_used = now - self.last_time
        if time_used <= 0:
            return 0.0
        run_steps = self.current_step - self.last_start_step
        self.last_start_step = self.current_step
        self.last_time = now
        return run_steps / time_used

    @property
    def is_running(self) -> bool:
        '''Whether `stop()` has not been called yet.'''
        return self._is_running

    @property
    def eta(self) -> str:
        '''Estimated remaining time formatted as hh:mm:ss.

        Returns '00:00:00' once the timer is stopped and '--:--:--' while no
        step has been counted yet (no rate to extrapolate from).
        '''
        if not self.is_running:
            return '00:00:00'
        if self.current_step <= 0:
            return '--:--:--'
        elapsed = time.time() - self.start_time
        # Average time per finished step, multiplied by the steps still to do.
        steps_left = self.total_step - self.current_step
        remaining_time = max(0.0, elapsed * steps_left / self.current_step)
        return seconds_to_hms(remaining_time)
|
||||||
|
|
||||||
|
|
||||||
|
def seconds_to_hms(seconds: int) -> str:
    '''Format a duration given in seconds as hh:mm:ss.

    Each field is left-padded with zeros to at least two characters; the
    hours field grows beyond two digits when needed.
    '''
    hours = math.floor(seconds / 3600)
    leftover = seconds - hours * 3600
    minutes = math.floor(leftover / 60)
    secs = int(leftover - minutes * 60)
    return f'{hours:0>2}:{minutes:0>2}:{secs:0>2}'
|
@ -0,0 +1,99 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
|
||||||
|
import setuptools
|
||||||
|
from setuptools.command.install import install
|
||||||
|
from setuptools.command.test import test
|
||||||
|
|
||||||
|
# Set the version here. It is passed to setuptools.setup() below and is also
# appended to paddleaudio/__init__.py as `__version__` by write_version_py().
VERSION = '1.0.2'
|
||||||
|
|
||||||
|
|
||||||
|
# Inspired by the example at https://pytest.org/latest/goodpractises.html
|
||||||
|
class TestCommand(test):
    """Custom `test` command: runs the benchmark scripts, then the test suite."""

    def finalize_options(self):
        """Finalize the base options, then clear the test arguments and
        enable the test suite flag."""
        super().finalize_options()
        self.test_suite = True
        self.test_args = []

    def run(self):
        """Run every benchmark script first, then the regular test command."""
        self.run_benchmark()
        super().run()

    def run_tests(self):
        """Run nose against the `tests` directory."""
        # nose is imported lazily so it is only needed when tests are run.
        import nose

        # argv simulates invoking `nosetests -w tests` directly.
        nose_argv = ['nosetests', '-w', 'tests']
        nose.run_exit(argv=nose_argv)

    def run_benchmark(self):
        """Invoke pytest, via the shell, on each file matched under
        `tests/benchmark`."""
        for script in glob.glob('tests/benchmark/*py'):
            command = f'pytest {script}'
            os.system(command)
|
||||||
|
|
||||||
|
|
||||||
|
class InstallCommand(install):
    """Custom `install` command that currently just defers to the base
    implementation."""

    def run(self):
        """Run the standard install step."""
        super().run()
|
||||||
|
|
||||||
|
|
||||||
|
def write_version_py(filename='paddleaudio/__init__.py'):
    """Append a `__version__ = '<VERSION>'` line to `filename`.

    The assignment is always written on a line of its own: a newline is
    inserted first when the existing file does not end with one, and the
    assignment itself is newline-terminated. Without this the assignment
    could be glued onto the file's last statement, and the later cleanup
    (which drops every line containing `__version__`) would then delete that
    statement as well.

    Args:
        filename (str): Path of the file to append to. It is created when it
            does not exist.
    """
    needs_newline = False
    try:
        # Binary mode so the last byte can be inspected without decoding.
        with open(filename, "rb") as f:
            f.seek(0, os.SEEK_END)
            if f.tell() > 0:
                f.seek(-1, os.SEEK_END)
                needs_newline = f.read(1) != b"\n"
    except FileNotFoundError:
        # Nothing to separate from; append mode below creates the file.
        pass

    with open(filename, "a") as f:
        if needs_newline:
            f.write("\n")
        f.write(f"__version__ = '{VERSION}'\n")
|
||||||
|
|
||||||
|
|
||||||
|
def remove_version_py(filename='paddleaudio/__init__.py'):
    """Rewrite `filename` without any line that contains `__version__`.

    Args:
        filename (str): Path of the file to clean up in place.
    """
    with open(filename, "r") as src:
        kept = [row for row in src.readlines() if "__version__" not in row]

    with open(filename, "w") as dst:
        dst.writelines(kept)
|
||||||
|
|
||||||
|
|
||||||
|
# Stamp the version into paddleaudio/__init__.py for the duration of the
# build. Any stale stamp is removed first so the line is never duplicated.
remove_version_py()
write_version_py()

try:
    setuptools.setup(
        name="paddleaudio",
        version=VERSION,
        author="",
        author_email="",
        description="PaddleAudio, in development",
        long_description="",
        long_description_content_type="text/markdown",
        url="",
        packages=setuptools.find_packages(include=['paddleaudio*']),
        classifiers=[
            "Programming Language :: Python :: 3",
            # Matches the Apache 2.0 licence header at the top of this file.
            "License :: OSI Approved :: Apache Software License",
            "Operating System :: OS Independent",
        ],
        python_requires='>=3.6',
        install_requires=[
            'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
            'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
        ],
        extras_require={
            'test': [
                'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1',
                'torchaudio==0.10.2', 'pytest-benchmark'
            ],
        },
        cmdclass={
            'install': InstallCommand,
            'test': TestCommand,
        }, )
finally:
    # Always restore the source tree, even when setup() raises or exits
    # (e.g. the test command ends via nose.run_exit).
    remove_version_py()
|
Loading…
Reference in new issue