split paddleaudio

pull/2388/head
Yang Zhou 3 years ago
parent cfd32d00de
commit 4df081b954

@ -0,0 +1,21 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import compliance
from . import datasets
from . import features
from . import functional
from . import io
from . import metric
from . import sox_effects
from . import backends

@ -0,0 +1,148 @@
import importlib.util
import warnings
from functools import wraps
from typing import Optional
#code is from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
def is_module_available(*modules: str) -> bool:
r"""Returns if a top-level module with :attr:`name` exists *without**
importing it. This is generally safer than try-catch block around a
`import X`. It avoids third party libraries breaking assumptions of some of
our tests, e.g., setting multiprocessing start method when imported
(see librosa/#747, torchvision/#544).
"""
return all(importlib.util.find_spec(m) is not None for m in modules)
def requires_module(*modules: str):
"""Decorate function to give error message if invoked without required optional modules.
This decorator is to give better error message to users rather
than raising ``NameError: name 'module' is not defined`` at random places.
"""
missing = [m for m in modules if not is_module_available(m)]
if not missing:
# fall through. If all the modules are available, no need to decorate
def decorator(func):
return func
else:
req = f"module: {missing[0]}" if len(
missing) == 1 else f"modules: {missing}"
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(
f"{func.__module__}.{func.__name__} requires {req}")
return wrapped
return decorator
def deprecated(direction: str, version: Optional[str]=None):
"""Decorator to add deprecation message
Args:
direction (str): Migration steps to be given to users.
version (str or int): The version when the object will be removed
"""
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
message = (
f"{func.__module__}.{func.__name__} has been deprecated "
f'and will be removed from {"future" if version is None else version} release. '
f"{direction}")
warnings.warn(message, stacklevel=2)
return func(*args, **kwargs)
return wrapped
return decorator
def is_kaldi_available():
return is_module_available("paddleaudio._paddleaudio")
def requires_kaldi():
if is_kaldi_available():
def decorator(func):
return func
else:
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(
f"{func.__module__}.{func.__name__} requires kaldi")
return wrapped
return decorator
def _check_soundfile_importable():
if not is_module_available("soundfile"):
return False
try:
import soundfile # noqa: F401
return True
except Exception:
warnings.warn(
"Failed to import soundfile. 'soundfile' backend is not available.")
return False
_is_soundfile_importable = _check_soundfile_importable()
def is_soundfile_available():
return _is_soundfile_importable
def requires_soundfile():
if is_soundfile_available():
def decorator(func):
return func
else:
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(
f"{func.__module__}.{func.__name__} requires soundfile")
return wrapped
return decorator
def is_sox_available():
return is_module_available("paddleaudio._paddleaudio")
def requires_sox():
if is_sox_available():
def decorator(func):
return func
else:
def decorator(func):
@wraps(func)
def wrapped(*args, **kwargs):
raise RuntimeError(
f"{func.__module__}.{func.__name__} requires sox")
return wrapped
return decorator

@ -0,0 +1,26 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .soundfile_backend import depth_convert
from .soundfile_backend import soundfile_load
from .soundfile_backend import normalize
from .soundfile_backend import resample
from .soundfile_backend import soundfile_save
from .soundfile_backend import to_mono
from . import utils
from .utils import get_audio_backend
from .utils import list_audio_backends
from .utils import set_audio_backend
utils._init_audio_backend()

@ -0,0 +1,55 @@
# Token form https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
class AudioInfo:
"""return of info function.
This class is used by :ref:`"sox_io" backend<sox_io_backend>` and
:ref:`"soundfile" backend with the new interface<soundfile_backend>`.
:ivar int sample_rate: Sample rate
:ivar int num_frames: The number of frames
:ivar int num_channels: The number of channels
:ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
or when it cannot be accurately inferred.
:ivar str encoding: Audio encoding
The values encoding can take are one of the following:
* ``PCM_S``: Signed integer linear PCM
* ``PCM_U``: Unsigned integer linear PCM
* ``PCM_F``: Floating point linear PCM
* ``FLAC``: Flac, Free Lossless Audio Codec
* ``ULAW``: Mu-law
* ``ALAW``: A-law
* ``MP3`` : MP3, MPEG-1 Audio Layer III
* ``VORBIS``: OGG Vorbis
* ``AMR_WB``: Adaptive Multi-Rate
* ``AMR_NB``: Adaptive Multi-Rate Wideband
* ``OPUS``: Opus
* ``HTK``: Single channel 16-bit PCM
* ``UNKNOWN`` : None of above
"""
def __init__(
self,
sample_rate: int,
num_frames: int,
num_channels: int,
bits_per_sample: int,
encoding: str,
):
self.sample_rate = sample_rate
self.num_frames = num_frames
self.num_channels = num_channels
self.bits_per_sample = bits_per_sample
self.encoding = encoding
def __str__(self):
return (
f"AudioMetaData("
f"sample_rate={self.sample_rate}, "
f"num_frames={self.num_frames}, "
f"num_channels={self.num_channels}, "
f"bits_per_sample={self.bits_per_sample}, "
f"encoding={self.encoding}"
f")"
)

@ -0,0 +1,32 @@
from pathlib import Path
from typing import Callable
from typing import Optional
from typing import Tuple
from typing import Union
from paddle import Tensor
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
def load(
filepath: Union[str, Path],
out: Optional[Tensor]=None,
normalization: Union[bool, float, Callable]=True,
channels_first: bool=True,
num_frames: int=0,
offset: int=0,
filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
raise RuntimeError("No audio I/O backend is available.")
def save(filepath: str,
src: Tensor,
sample_rate: int,
precision: int=16,
channels_first: bool=True) -> None:
raise RuntimeError("No audio I/O backend is available.")
def info(filepath: str) -> None:
raise RuntimeError("No audio I/O backend is available.")

@ -0,0 +1,661 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import warnings
from typing import Optional
from typing import Tuple
import numpy as np
import paddle
import resampy
import soundfile
from scipy.io import wavfile
from ..utils import depth_convert
from ..utils import ParameterError
from .common import AudioInfo
__all__ = [
'resample',
'to_mono',
'normalize',
'save',
'soundfile_save',
'load',
'soundfile_load',
'info',
]
NORMALMIZE_TYPES = ['linear', 'gaussian']
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
EPS = 1e-8
def resample(y: np.ndarray,
src_sr: int,
target_sr: int,
mode: str='kaiser_fast') -> np.ndarray:
"""Audio resampling.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
src_sr (int): Source sample rate.
target_sr (int): Target sample rate.
mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
Returns:
np.ndarray: `y` resampled to `target_sr`
"""
if mode == 'kaiser_best':
warnings.warn(
f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
we recommend the mode kaiser_fast in large scale audio trainning')
if not isinstance(y, np.ndarray):
raise ParameterError(
'Only support numpy np.ndarray, but received y in {type(y)}')
if mode not in RESAMPLE_MODES:
raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
"""Convert sterior audio to mono.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
Returns:
np.ndarray: `y` with mono channel.
"""
if merge_type not in MERGE_TYPES:
raise ParameterError(
f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
)
if y.ndim > 2:
raise ParameterError(
f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
if y.ndim == 1: # nothing to merge
return y
if merge_type == 'ch0':
return y[0]
if merge_type == 'ch1':
return y[1]
if merge_type == 'random':
return y[np.random.randint(0, 2)]
# need to do averaging according to dtype
if y.dtype == 'float32':
y_out = (y[0] + y[1]) * 0.5
elif y.dtype == 'int16':
y_out = y.astype('int32')
y_out = (y_out[0] + y_out[1]) // 2
y_out = np.clip(y_out, np.iinfo(y.dtype).min,
np.iinfo(y.dtype).max).astype(y.dtype)
elif y.dtype == 'int8':
y_out = y.astype('int16')
y_out = (y_out[0] + y_out[1]) // 2
y_out = np.clip(y_out, np.iinfo(y.dtype).min,
np.iinfo(y.dtype).max).astype(y.dtype)
else:
raise ParameterError(f'Unsupported dtype: {y.dtype}')
return y_out
def soundfile_load_(file: os.PathLike,
offset: Optional[float]=None,
dtype: str='int16',
duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
"""Load audio using soundfile library. This function load audio file using libsndfile.
Args:
file (os.PathLike): File of waveform.
offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
dtype (str, optional): Data type of waveform. Defaults to 'int16'.
duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
Returns:
Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
"""
with soundfile.SoundFile(file) as sf_desc:
sr_native = sf_desc.samplerate
if offset:
sf_desc.seek(int(offset * sr_native))
if duration is not None:
frame_duration = int(duration * sr_native)
else:
frame_duration = -1
y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
return y, sf_desc.samplerate
def normalize(y: np.ndarray, norm_type: str='linear',
mul_factor: float=1.0) -> np.ndarray:
"""Normalize an input audio with additional multiplier.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
norm_type (str, optional): Type of normalization. Defaults to 'linear'.
mul_factor (float, optional): Scaling factor. Defaults to 1.0.
Returns:
np.ndarray: `y` after normalization.
"""
if norm_type == 'linear':
amax = np.max(np.abs(y))
factor = 1.0 / (amax + EPS)
y = y * factor * mul_factor
elif norm_type == 'gaussian':
amean = np.mean(y)
astd = np.std(y)
astd = max(astd, EPS)
y = mul_factor * (y - amean) / astd
else:
raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
return y
def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
"""Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
sr (int): Sample rate.
file (os.PathLike): Path of auido file to save.
"""
if not file.endswith('.wav'):
raise ParameterError(
f'only .wav file supported, but dst file name is: {file}')
if sr <= 0:
raise ParameterError(
f'Sample rate should be larger than 0, recieved sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
f'input data type is {y.dtype}, will convert data to int16 format before saving'
)
y_out = depth_convert(y, 'int16')
else:
y_out = y
wavfile.write(file, sr, y_out)
def soundfile_load(
file: os.PathLike,
sr: Optional[int]=None,
mono: bool=True,
merge_type: str='average', # ch0,ch1,random,average
normal: bool=True,
norm_type: str='linear',
norm_mul_factor: float=1.0,
offset: float=0.0,
duration: Optional[int]=None,
dtype: str='float32',
resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
"""Load audio file from disk. This function loads audio from disk using using audio beackend.
Args:
file (os.PathLike): Path of auido file to load.
sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
mono (bool, optional): Return waveform with mono channel. Defaults to True.
merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
normal (bool, optional): Waveform normalization. Defaults to True.
norm_type (str, optional): Type of normalization. Defaults to 'linear'.
norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
dtype (str, optional): Data type of waveform. Defaults to 'float32'.
resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
Returns:
Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
"""
y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration)
if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
raise ParameterError(f'audio file {file} looks empty')
if mono:
y = to_mono(y, merge_type)
if sr is not None and sr != r:
y = resample(y, r, sr, mode=resample_mode)
r = sr
if normal:
y = normalize(y, norm_type, norm_mul_factor)
elif dtype in ['int8', 'int16']:
# still need to do normalization, before depth convertion
y = normalize(y, 'linear', 1.0)
y = depth_convert(y, dtype)
return y, r
#the code below token form: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modificaion.
def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int):
if not encoding:
if not bits_per_sample:
subtype = {
paddle.uint8: "PCM_U8",
paddle.int16: "PCM_16",
paddle.int32: "PCM_32",
paddle.float32: "FLOAT",
paddle.float64: "DOUBLE",
}.get(dtype)
if not subtype:
raise ValueError(f"Unsupported dtype for wav: {dtype}")
return subtype
if bits_per_sample == 8:
return "PCM_U8"
return f"PCM_{bits_per_sample}"
if encoding == "PCM_S":
if not bits_per_sample:
return "PCM_32"
if bits_per_sample == 8:
raise ValueError("wav does not support 8-bit signed PCM encoding.")
return f"PCM_{bits_per_sample}"
if encoding == "PCM_U":
if bits_per_sample in (None, 8):
return "PCM_U8"
raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
if encoding == "PCM_F":
if bits_per_sample in (None, 32):
return "FLOAT"
if bits_per_sample == 64:
return "DOUBLE"
raise ValueError("wav only supports 32/64-bit float PCM encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("wav only supports 8-bit mu-law encoding.")
if encoding == "ALAW":
if bits_per_sample in (None, 8):
return "ALAW"
raise ValueError("wav only supports 8-bit a-law encoding.")
raise ValueError(f"wav does not support {encoding}.")
def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
if encoding in (None, "PCM_S"):
return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
if encoding in ("PCM_U", "PCM_F"):
raise ValueError(f"sph does not support {encoding} encoding.")
if encoding == "ULAW":
if bits_per_sample in (None, 8):
return "ULAW"
raise ValueError("sph only supports 8-bit for mu-law encoding.")
if encoding == "ALAW":
return "ALAW"
raise ValueError(f"sph does not support {encoding}.")
def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int):
if format == "wav":
return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
if format == "flac":
if encoding:
raise ValueError("flac does not support encoding.")
if not bits_per_sample:
return "PCM_16"
if bits_per_sample > 24:
raise ValueError("flac does not support bits_per_sample > 24.")
return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
if format in ("ogg", "vorbis"):
if encoding or bits_per_sample:
raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.")
return "VORBIS"
if format == "sph":
return _get_subtype_for_sphere(encoding, bits_per_sample)
if format in ("nis", "nist"):
return "PCM_16"
raise ValueError(f"Unsupported format: {format}")
def save(
filepath: str,
src: paddle.Tensor,
sample_rate: int,
channels_first: bool = True,
compression: Optional[float] = None,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
):
"""Save audio data to file.
Note:
The formats this function can handle depend on the soundfile installation.
This function is tested on the following formats;
* WAV
* 32-bit floating-point
* 32-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer
* FLAC
* OGG/VORBIS
* SPHERE
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
Args:
filepath (str or pathlib.Path): Path to audio file.
src (paddle.Tensor): Audio data to save. must be 2D tensor.
sample_rate (int): sampling rate
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
otherwise `[time, channel]`.
compression (float of None, optional): Not used.
It is here only for interface compatibility reson with "sox_io" backend.
format (str or None, optional): Override the audio format.
When ``filepath`` argument is path-like object, audio format is
inferred from file extension. If the file extension is missing or
different, you can specify the correct format with this argument.
When ``filepath`` argument is file-like object,
this argument is required.
Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
``"flac"`` and ``"sph"``.
encoding (str or None, optional): Changes the encoding for supported formats.
This argument is effective only for supported formats, sush as
``"wav"``, ``""flac"`` and ``"sph"``. Valid values are;
- ``"PCM_S"`` (signed integer Linear PCM)
- ``"PCM_U"`` (unsigned integer Linear PCM)
- ``"PCM_F"`` (floating point PCM)
- ``"ULAW"`` (mu-law)
- ``"ALAW"`` (a-law)
bits_per_sample (int or None, optional): Changes the bit depth for the
supported formats.
When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
you can change the bit depth.
Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
Supported formats/encodings/bit depth/compression are:
``"wav"``
- 32-bit floating-point PCM
- 32-bit signed integer PCM
- 24-bit signed integer PCM
- 16-bit signed integer PCM
- 8-bit unsigned integer PCM
- 8-bit mu-law
- 8-bit a-law
Note:
Default encoding/bit depth is determined by the dtype of
the input Tensor.
``"flac"``
- 8-bit
- 16-bit (default)
- 24-bit
``"ogg"``, ``"vorbis"``
- Doesn't accept changing configuration.
``"sph"``
- 8-bit signed integer PCM
- 16-bit signed integer PCM
- 24-bit signed integer PCM
- 32-bit signed integer PCM (default)
- 8-bit mu-law
- 8-bit a-law
- 16-bit a-law
- 24-bit a-law
- 32-bit a-law
"""
if src.ndim != 2:
raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
if compression is not None:
warnings.warn(
'`save` function of "soundfile" backend does not support "compression" parameter. '
"The argument is silently ignored."
)
if hasattr(filepath, "write"):
if format is None:
raise RuntimeError("`format` is required when saving to file object.")
ext = format.lower()
else:
ext = str(filepath).split(".")[-1].lower()
if bits_per_sample not in (None, 8, 16, 24, 32, 64):
raise ValueError("Invalid bits_per_sample.")
if bits_per_sample == 24:
warnings.warn(
"Saving audio with 24 bits per sample might warp samples near -1. "
"Using 16 bits per sample might be able to avoid this."
)
subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
# sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format,
# so we extend the extensions manually here
if ext in ["nis", "nist", "sph"] and format is None:
format = "NIST"
if channels_first:
src = src.t()
soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format)
_SUBTYPE2DTYPE = {
"PCM_S8": "int8",
"PCM_U8": "uint8",
"PCM_16": "int16",
"PCM_32": "int32",
"FLOAT": "float32",
"DOUBLE": "float64",
}
def load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[paddle.Tensor, int]:
"""Load audio data from file.
Note:
The formats this function can handle depend on the soundfile installation.
This function is tested on the following formats;
* WAV
* 32-bit floating-point
* 32-bit signed integer
* 16-bit signed integer
* 8-bit unsigned integer
* FLAC
* OGG/VORBIS
* SPHERE
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
``float32`` dtype and the shape of `[channel, time]`.
The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
by providing ``normalize=False``, this function can return integer Tensor, where the samples
are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
``flac`` and ``mp3``.
For these formats, this function always returns ``float32`` Tensor with values normalized to
``[-1.0, 1.0]``.
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend.
Args:
filepath (path-like object or file-like object):
Source of audio data.
frame_offset (int, optional):
Number of frames to skip before start reading data.
num_frames (int, optional):
Maximum number of frames to read. ``-1`` reads all the remaining samples,
starting from ``frame_offset``.
This function may return the less number of frames if there is not enough
frames in the given file.
normalize (bool, optional):
When ``True``, this function always return ``float32``, and sample values are
normalized to ``[-1.0, 1.0]``.
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to
integer type.
This argument has no effect for formats other than integer WAV type.
channels_first (bool, optional):
When True, the returned Tensor has dimension `[channel, time]`.
Otherwise, the returned Tensor's dimension is `[time, channel]`.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
(paddle.Tensor, int): Resulting Tensor and sample rate.
If the input file has integer wav format and normalization is off, then it has
integer type, else ``float32`` type. If ``channels_first=True``, it has
`[channel, time]` else `[time, channel]`.
"""
with soundfile.SoundFile(filepath, "r") as file_:
if file_.format != "WAV" or normalize:
dtype = "float32"
elif file_.subtype not in _SUBTYPE2DTYPE:
raise ValueError(f"Unsupported subtype: {file_.subtype}")
else:
dtype = _SUBTYPE2DTYPE[file_.subtype]
frames = file_._prepare_read(frame_offset, None, num_frames)
waveform = file_.read(frames, dtype, always_2d=True)
sample_rate = file_.samplerate
waveform = paddle.to_tensor(waveform)
if channels_first:
waveform = paddle.transpose(waveform, perm=[1,0])
return waveform, sample_rate
# Mapping from soundfile subtype to number of bits per sample.
# This is mostly heuristical and the value is set to 0 when it is irrelevant
# (lossy formats) or when it can't be inferred.
# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
# the default seems to be 8 bits but it can be compressed further to 4 bits.
# The dict is inspired from
# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
_SUBTYPE_TO_BITS_PER_SAMPLE = {
"PCM_S8": 8, # Signed 8 bit data
"PCM_16": 16, # Signed 16 bit data
"PCM_24": 24, # Signed 24 bit data
"PCM_32": 32, # Signed 32 bit data
"PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only)
"FLOAT": 32, # 32 bit float data
"DOUBLE": 64, # 64 bit float data
"ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
"IMA_ADPCM": 0, # IMA ADPCM.
"MS_ADPCM": 0, # Microsoft ADPCM.
"GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
"VOX_ADPCM": 0, # OKI / Dialogix ADPCM
"G721_32": 0, # 32kbs G721 ADPCM encoding.
"G723_24": 0, # 24kbs G723 ADPCM encoding.
"G723_40": 0, # 40kbs G723 ADPCM encoding.
"DWVW_12": 12, # 12 bit Delta Width Variable Word encoding.
"DWVW_16": 16, # 16 bit Delta Width Variable Word encoding.
"DWVW_24": 24, # 24 bit Delta Width Variable Word encoding.
"DWVW_N": 0, # N bit Delta Width Variable Word encoding.
"DPCM_8": 8, # 8 bit differential PCM (XI only)
"DPCM_16": 16, # 16 bit differential PCM (XI only)
"VORBIS": 0, # Xiph Vorbis encoding. (lossy)
"ALAC_16": 16, # Apple Lossless Audio Codec (16 bit).
"ALAC_20": 20, # Apple Lossless Audio Codec (20 bit).
"ALAC_24": 24, # Apple Lossless Audio Codec (24 bit).
"ALAC_32": 32, # Apple Lossless Audio Codec (32 bit).
}
def _get_bit_depth(subtype):
if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
warnings.warn(
f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
"attribute will be set to 0. If you are seeing this warning, please "
"report by opening an issue on github (after checking for existing/closed ones). "
"You may otherwise ignore this warning."
)
return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
_SUBTYPE_TO_ENCODING = {
"PCM_S8": "PCM_S",
"PCM_16": "PCM_S",
"PCM_24": "PCM_S",
"PCM_32": "PCM_S",
"PCM_U8": "PCM_U",
"FLOAT": "PCM_F",
"DOUBLE": "PCM_F",
"ULAW": "ULAW",
"ALAW": "ALAW",
"VORBIS": "VORBIS",
}
def _get_encoding(format: str, subtype: str):
if format == "FLAC":
return "FLAC"
return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
def info(filepath: str, format: Optional[str] = None) -> AudioInfo:
"""Get signal information of an audio file.
Note:
``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend,
Args:
filepath (path-like object or file-like object):
Source of audio data.
format (str or None, optional):
Not used. PySoundFile does not accept format hint.
Returns:
AudioInfo: meta data of the given audio.
"""
sinfo = soundfile.info(filepath)
return AudioInfo(
sinfo.samplerate,
sinfo.frames,
sinfo.channels,
bits_per_sample=_get_bit_depth(sinfo.subtype),
encoding=_get_encoding(sinfo.format, sinfo.subtype),
)

@ -0,0 +1,101 @@
from pathlib import Path
from typing import Callable
from typing import Optional, Tuple, Union
import paddle
import paddleaudio
from paddle import Tensor
from .common import AudioInfo
import os
from paddleaudio._internal import module_utils as _mod_utils
#https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
# Note: need to comply TorchScript syntax -- need annotation and no f-string
def _fail_load(
filepath: str,
frame_offset: int = 0,
num_frames: int = -1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str] = None,
) -> Tuple[Tensor, int]:
raise RuntimeError("Failed to load audio from {}".format(filepath))
def _fail_load_fileobj(fileobj, *args, **kwargs):
raise RuntimeError(f"Failed to load audio from {fileobj}")
_fallback_info = _fail_info
_fallback_info_fileobj = _fail_info_fileobj
_fallback_load = _fail_load
_fallback_load_filebj = _fail_load_fileobj
@_mod_utils.requires_sox()
def load(
filepath: str,
frame_offset: int = 0,
num_frames: int=-1,
normalize: bool = True,
channels_first: bool = True,
format: Optional[str]=None, ) -> Tuple[Tensor, int]:
if hasattr(filepath, "read"):
ret = paddleaudio._paddleaudio.load_audio_fileobj(
filepath, frame_offset, num_frames, normalize, channels_first, format
)
if ret is not None:
audio_tensor = paddle.to_tensor(ret[0])
return (audio_tensor, ret[1])
return _fallback_load_fileobj(filepath, frame_offset, num_frames, normalize, channels_first, format)
filepath = os.fspath(filepath)
ret = paddleaudio._paddleaudio.sox_io_load_audio_file(
filepath, frame_offset, num_frames, normalize, channels_first, format
)
if ret is not None:
audio_tensor = paddle.to_tensor(ret[0])
return (audio_tensor, ret[1])
return _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, format)
@_mod_utils.requires_sox()
def save(filepath: str,
src: Tensor,
sample_rate: int,
channels_first: bool = True,
compression: Optional[float] = None,
format: Optional[str] = None,
encoding: Optional[str] = None,
bits_per_sample: Optional[int] = None,
):
src_arr = src.numpy()
if hasattr(filepath, "write"):
paddleaudio._paddleaudio.save_audio_fileobj(
filepath, src_arr, sample_rate, channels_first, compression, format, encoding, bits_per_sample
)
return
filepath = os.fspath(filepath)
paddleaudio._paddleaudio.sox_io_save_audio_file(
filepath, src_arr, sample_rate, channels_first, compression, format, encoding, bits_per_sample
)
@_mod_utils.requires_sox()
def info(filepath: str, format: Optional[str] = None,) -> AudioInfo:
if hasattr(filepath, "read"):
sinfo = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
if sinfo is not None:
return AudioInfo(*sinfo)
return _fallback_info_fileobj(filepath, format)
filepath = os.fspath(filepath)
sinfo = paddleaudio._paddleaudio.get_info_file(filepath, format)
if sinfo is not None:
return AudioInfo(*sinfo)
return _fallback_info(filepath, format)

@ -0,0 +1,81 @@
"""Defines utilities for switching audio backends"""
#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
import warnings
from typing import List
from typing import Optional
import paddleaudio
from paddleaudio._internal import module_utils as _mod_utils
from . import no_backend, soundfile_backend, sox_io_backend
__all__ = [
"list_audio_backends",
"get_audio_backend",
"set_audio_backend",
]
def list_audio_backends() -> List[str]:
"""List available backends
Returns:
List[str]: The list of available backends.
"""
backends = []
if _mod_utils.is_module_available("soundfile"):
backends.append("soundfile")
if _mod_utils.is_sox_available():
backends.append("sox_io")
return backends
def set_audio_backend(backend: Optional[str]):
"""Set the backend for I/O operation
Args:
backend (str or None): Name of the backend.
One of ``"sox_io"`` or ``"soundfile"`` based on availability
of the system. If ``None`` is provided the current backend is unassigned.
"""
if backend is not None and backend not in list_audio_backends():
raise RuntimeError(f'Backend "{backend}" is not one of '
f"available backends: {list_audio_backends()}.")
if backend is None:
module = no_backend
elif backend == "sox_io":
module = sox_io_backend
elif backend == "soundfile":
module = soundfile_backend
else:
raise NotImplementedError(f'Unexpected backend "{backend}"')
for func in ["save", "load", "info"]:
setattr(paddleaudio, func, getattr(module, func))
def _init_audio_backend():
backends = list_audio_backends()
if "soundfile" in backends:
set_audio_backend("soundfile")
elif "sox_io" in backends:
set_audio_backend("sox_io")
else:
warnings.warn("No audio backend is available.")
set_audio_backend(None)
def get_audio_backend() -> Optional[str]:
"""Get the name of the current backend
Returns:
Optional[str]: The name of the current backend or ``None`` if no backend is assigned.
"""
if paddleaudio.load == no_backend.load:
return None
if paddleaudio.load == sox_io_backend.load:
return "sox_io"
if paddleaudio.load == soundfile_backend.load:
return "soundfile"
raise ValueError("Unknown backend.")

@ -0,0 +1,15 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import kaldi
from . import librosa

@ -0,0 +1,638 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from torchaudio(https://github.com/pytorch/audio)
import math
from typing import Tuple
import paddle
from paddle import Tensor
from ..functional import create_dct
from ..functional.window import get_window
__all__ = [
'spectrogram',
'fbank',
'mfcc',
]
# window types
HANNING = 'hann'
HAMMING = 'hamming'
POVEY = 'povey'
RECTANGULAR = 'rect'
BLACKMAN = 'blackman'
def _get_epsilon(dtype):
return paddle.to_tensor(1e-07, dtype=dtype)
def _next_power_of_2(x: int) -> int:
return 1 if x == 0 else 2**(x - 1).bit_length()
def _get_strided(waveform: Tensor,
window_size: int,
window_shift: int,
snip_edges: bool) -> Tensor:
assert waveform.dim() == 1
num_samples = waveform.shape[0]
if snip_edges:
if num_samples < window_size:
return paddle.empty((0, 0), dtype=waveform.dtype)
else:
m = 1 + (num_samples - window_size) // window_shift
else:
reversed_waveform = paddle.flip(waveform, [0])
m = (num_samples + (window_shift // 2)) // window_shift
pad = window_size // 2 - window_shift // 2
pad_right = reversed_waveform
if pad > 0:
pad_left = reversed_waveform[-pad:]
waveform = paddle.concat((pad_left, waveform, pad_right), axis=0)
else:
waveform = paddle.concat((waveform[-pad:], pad_right), axis=0)
return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T
def _feature_window_function(
window_type: str,
window_size: int,
blackman_coeff: float,
dtype: int, ) -> Tensor:
if window_type == HANNING:
return get_window('hann', window_size, fftbins=False, dtype=dtype)
elif window_type == HAMMING:
return get_window('hamming', window_size, fftbins=False, dtype=dtype)
elif window_type == POVEY:
return get_window(
'hann', window_size, fftbins=False, dtype=dtype).pow(0.85)
elif window_type == RECTANGULAR:
return paddle.ones([window_size], dtype=dtype)
elif window_type == BLACKMAN:
a = 2 * math.pi / (window_size - 1)
window_function = paddle.arange(window_size, dtype=dtype)
return (blackman_coeff - 0.5 * paddle.cos(a * window_function) +
(0.5 - blackman_coeff) * paddle.cos(2 * a * window_function)
).astype(dtype)
else:
raise Exception('Invalid window type ' + window_type)
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
energy_floor: float) -> Tensor:
log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log()
if energy_floor == 0.0:
return log_energy
return paddle.maximum(
log_energy,
paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype))
def _get_waveform_and_window_properties(
waveform: Tensor,
channel: int,
sr: int,
frame_shift: float,
frame_length: float,
round_to_power_of_two: bool,
preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
channel = max(channel, 0)
assert channel < waveform.shape[0], (
'Invalid channel {} for size {}'.format(channel, waveform.shape[0]))
waveform = waveform[channel, :] # size (n)
window_shift = int(
sr * frame_shift *
0.001) # pass frame_shift and frame_length in milliseconds
window_size = int(sr * frame_length * 0.001)
padded_window_size = _next_power_of_2(
window_size) if round_to_power_of_two else window_size
assert 2 <= window_size <= len(waveform), (
'choose a window size {} that is [2, {}]'.format(window_size,
len(waveform)))
assert 0 < window_shift, '`window_shift` must be greater than 0'
assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \
' use `round_to_power_of_two` or change `frame_length`'
assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
assert sr > 0, '`sr` must be greater than zero'
return waveform, window_shift, window_size, padded_window_size
def _get_window(waveform: Tensor,
padded_window_size: int,
window_size: int,
window_shift: int,
window_type: str,
blackman_coeff: float,
snip_edges: bool,
raw_energy: bool,
energy_floor: float,
dither: float,
remove_dc_offset: bool,
preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
dtype = waveform.dtype
epsilon = _get_epsilon(dtype)
# (m, window_size)
strided_input = _get_strided(waveform, window_size, window_shift,
snip_edges)
if dither != 0.0:
x = paddle.maximum(epsilon,
paddle.rand(strided_input.shape, dtype=dtype))
rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x)
strided_input = strided_input + rand_gauss * dither
if remove_dc_offset:
row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1)
strided_input = strided_input - row_means
if raw_energy:
signal_log_energy = _get_log_energy(strided_input, epsilon,
energy_floor) # (m)
if preemphasis_coefficient != 0.0:
offset_strided_input = paddle.nn.functional.pad(
strided_input.unsqueeze(0), (1, 0),
data_format='NCL',
mode='replicate').squeeze(0) # (m, window_size + 1)
strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :
-1]
window_function = _feature_window_function(
window_type, window_size, blackman_coeff,
dtype).unsqueeze(0) # (1, window_size)
strided_input = strided_input * window_function # (m, window_size)
# (m, padded_window_size)
if padded_window_size != window_size:
padding_right = padded_window_size - window_size
strided_input = paddle.nn.functional.pad(
strided_input.unsqueeze(0), (0, padding_right),
data_format='NCL',
mode='constant',
value=0).squeeze(0)
if not raw_energy:
signal_log_energy = _get_log_energy(strided_input, epsilon,
energy_floor) # size (m)
return strided_input, signal_log_energy
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
if subtract_mean:
col_means = paddle.mean(tensor, axis=0).unsqueeze(0)
tensor = tensor - col_means
return tensor
def spectrogram(waveform: Tensor,
blackman_coeff: float=0.42,
channel: int=-1,
dither: float=0.0,
energy_floor: float=1.0,
frame_length: float=25.0,
frame_shift: float=10.0,
preemphasis_coefficient: float=0.97,
raw_energy: bool=True,
remove_dc_offset: bool=True,
round_to_power_of_two: bool=True,
sr: int=16000,
snip_edges: bool=True,
subtract_mean: bool=False,
window_type: str=POVEY) -> Tensor:
"""Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
Args:
waveform (Tensor): A waveform tensor with shape `(C, T)`.
blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant . Defaults to 0.0.
energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns:
Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames
depends on frame_length and frame_shift.
"""
dtype = waveform.dtype
epsilon = _get_epsilon(dtype)
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
preemphasis_coefficient)
strided_input, signal_log_energy = _get_window(
waveform, padded_window_size, window_size, window_shift, window_type,
blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
remove_dc_offset, preemphasis_coefficient)
# (m, padded_window_size // 2 + 1, 2)
fft = paddle.fft.rfft(strided_input)
power_spectrum = paddle.maximum(
fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1)
power_spectrum[:, 0] = signal_log_energy
power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
return power_spectrum
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
def _mel_scale_scalar(freq: float) -> float:
return 1127.0 * math.log(1.0 + freq / 700.0)
def _mel_scale(freq: Tensor) -> Tensor:
return 1127.0 * (1.0 + freq / 700.0).log()
def _vtln_warp_freq(vtln_low_cutoff: float,
vtln_high_cutoff: float,
low_freq: float,
high_freq: float,
vtln_warp_factor: float,
freq: Tensor) -> Tensor:
assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
scale = 1.0 / vtln_warp_factor
Fl = scale * l
Fh = scale * h
assert l > low_freq and h < high_freq
scale_left = (Fl - low_freq) / (l - low_freq)
scale_right = (high_freq - Fh) / (high_freq - h)
res = paddle.empty_like(freq)
outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
| paddle.greater_than(freq, paddle.to_tensor(high_freq))
before_l = paddle.less_than(freq, paddle.to_tensor(l))
before_h = paddle.less_than(freq, paddle.to_tensor(h))
after_h = paddle.greater_equal(freq, paddle.to_tensor(h))
res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
res[before_h] = scale * freq[before_h]
res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
res[outside_low_high_freq] = freq[outside_low_high_freq]
return res
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
vtln_high_cutoff: float,
low_freq,
high_freq: float,
vtln_warp_factor: float,
mel_freq: Tensor) -> Tensor:
return _mel_scale(
_vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq,
vtln_warp_factor, _inverse_mel_scale(mel_freq)))
def _get_mel_banks(num_bins: int,
window_length_padded: int,
sample_freq: float,
low_freq: float,
high_freq: float,
vtln_low: float,
vtln_high: float,
vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
assert num_bins > 3, 'Must have at least 3 mel bins'
assert window_length_padded % 2 == 0
num_fft_bins = window_length_padded / 2
nyquist = 0.5 * sample_freq
if high_freq <= 0.0:
high_freq += nyquist
assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))
fft_bin_width = sample_freq / window_length_padded
mel_low_freq = _mel_scale_scalar(low_freq)
mel_high_freq = _mel_scale_scalar(high_freq)
mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
if vtln_high < 0.0:
vtln_high += nyquist
assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
(0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
('Bad values in options: vtln-low {} and vtln-high {}, versus '
'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))
bin = paddle.arange(num_bins).unsqueeze(1)
left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1)
center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1)
right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1)
if vtln_warp_factor != 1.0:
left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, left_mel)
center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
high_freq, vtln_warp_factor,
center_mel)
right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
high_freq, vtln_warp_factor, right_mel)
center_freqs = _inverse_mel_scale(center_mel) # (num_bins)
# (1, num_fft_bins)
mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)
# (num_bins, num_fft_bins)
up_slope = (mel - left_mel) / (center_mel - left_mel)
down_slope = (right_mel - mel) / (right_mel - center_mel)
if vtln_warp_factor == 1.0:
bins = paddle.maximum(
paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
else:
bins = paddle.zeros_like(up_slope)
up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
mel, center_mel)
down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
mel, right_mel)
bins[up_idx] = up_slope[up_idx]
bins[down_idx] = down_slope[down_idx]
return bins, center_freqs
def fbank(waveform: Tensor,
blackman_coeff: float=0.42,
channel: int=-1,
dither: float=0.0,
energy_floor: float=1.0,
frame_length: float=25.0,
frame_shift: float=10.0,
high_freq: float=0.0,
htk_compat: bool=False,
low_freq: float=20.0,
n_mels: int=23,
preemphasis_coefficient: float=0.97,
raw_energy: bool=True,
remove_dc_offset: bool=True,
round_to_power_of_two: bool=True,
sr: int=16000,
snip_edges: bool=True,
subtract_mean: bool=False,
use_energy: bool=False,
use_log_fbank: bool=True,
use_power: bool=True,
vtln_high: float=-500.0,
vtln_low: float=100.0,
vtln_warp: float=1.0,
window_type: str=POVEY) -> Tensor:
"""Compute and return filter banks from a waveform. The output is identical to Kaldi's.
Args:
waveform (Tensor): A waveform tensor with shape `(C, T)`.
blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant . Defaults to 0.0.
energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
n_mels (int, optional): Number of output mel bins. Defaults to 23.
preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns:
Tensor: A filter banks tensor with shape `(m, n_mels)`.
"""
dtype = waveform.dtype
waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
preemphasis_coefficient)
strided_input, signal_log_energy = _get_window(
waveform, padded_window_size, window_size, window_shift, window_type,
blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
remove_dc_offset, preemphasis_coefficient)
# (m, padded_window_size // 2 + 1)
spectrum = paddle.fft.rfft(strided_input).abs()
if use_power:
spectrum = spectrum.pow(2.)
# (n_mels, padded_window_size // 2)
mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
high_freq, vtln_low, vtln_high, vtln_warp)
mel_energies = mel_energies.astype(dtype)
# (n_mels, padded_window_size // 2 + 1)
mel_energies = paddle.nn.functional.pad(
mel_energies.unsqueeze(0), (0, 1),
data_format='NCL',
mode='constant',
value=0).squeeze(0)
# (m, n_mels)
mel_energies = paddle.mm(spectrum, mel_energies.T)
if use_log_fbank:
mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log()
if use_energy:
signal_log_energy = signal_log_energy.unsqueeze(1)
if htk_compat:
mel_energies = paddle.concat(
(mel_energies, signal_log_energy), axis=1)
else:
mel_energies = paddle.concat(
(signal_log_energy, mel_energies), axis=1)
# (m, n_mels + 1)
mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
return mel_energies
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
dct_matrix = create_dct(n_mels, n_mels, 'ortho')
dct_matrix[:, 0] = math.sqrt(1 / float(n_mels))
dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc)
return dct_matrix
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
i = paddle.arange(n_mfcc)
return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i /
cepstral_lifter)
def mfcc(waveform: Tensor,
blackman_coeff: float=0.42,
cepstral_lifter: float=22.0,
channel: int=-1,
dither: float=0.0,
energy_floor: float=1.0,
frame_length: float=25.0,
frame_shift: float=10.0,
high_freq: float=0.0,
htk_compat: bool=False,
low_freq: float=20.0,
n_mfcc: int=13,
n_mels: int=23,
preemphasis_coefficient: float=0.97,
raw_energy: bool=True,
remove_dc_offset: bool=True,
round_to_power_of_two: bool=True,
sr: int=16000,
snip_edges: bool=True,
subtract_mean: bool=False,
use_energy: bool=False,
vtln_high: float=-500.0,
vtln_low: float=100.0,
vtln_warp: float=1.0,
window_type: str=POVEY) -> Tensor:
"""Compute and return mel frequency cepstral coefficients from a waveform. The output is
identical to Kaldi's.
Args:
waveform (Tensor): A waveform tensor with shape `(C, T)`.
blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant . Defaults to 0.0.
energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
n_mels (int, optional): Number of output mel bins. Defaults to 23.
preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns:
Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
"""
assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
n_mfcc, n_mels)
dtype = waveform.dtype
# (m, n_mels + use_energy)
feature = fbank(
waveform=waveform,
blackman_coeff=blackman_coeff,
channel=channel,
dither=dither,
energy_floor=energy_floor,
frame_length=frame_length,
frame_shift=frame_shift,
high_freq=high_freq,
htk_compat=htk_compat,
low_freq=low_freq,
n_mels=n_mels,
preemphasis_coefficient=preemphasis_coefficient,
raw_energy=raw_energy,
remove_dc_offset=remove_dc_offset,
round_to_power_of_two=round_to_power_of_two,
sr=sr,
snip_edges=snip_edges,
subtract_mean=False,
use_energy=use_energy,
use_log_fbank=True,
use_power=True,
vtln_high=vtln_high,
vtln_low=vtln_low,
vtln_warp=vtln_warp,
window_type=window_type)
if use_energy:
# (m)
signal_log_energy = feature[:, n_mels if htk_compat else 0]
mel_offset = int(not htk_compat)
feature = feature[:, mel_offset:(n_mels + mel_offset)]
# (n_mels, n_mfcc)
dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)
# (m, n_mfcc)
feature = feature.matmul(dct_matrix)
if cepstral_lifter != 0.0:
# (1, n_mfcc)
lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
feature *= lifter_coeffs.astype(dtype=dtype)
if use_energy:
feature[:, 0] = signal_log_energy
if htk_compat:
energy = feature[:, 0].unsqueeze(1) # (m, 1)
feature = feature[:, 1:] # (m, n_mfcc - 1)
if not use_energy:
energy *= math.sqrt(2)
feature = paddle.concat((feature, energy), axis=1)
feature = _subtract_column_mean(feature, subtract_mean)
return feature

@ -0,0 +1,788 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import warnings
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import scipy
from numpy.lib.stride_tricks import as_strided
from scipy import signal
from ..backends import depth_convert
from ..utils import ParameterError
__all__ = [
# dsp
'stft',
'mfcc',
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'power_to_db',
'compute_fbank_matrix',
'melspectrogram',
'spectrogram',
'mu_encode',
'mu_decode',
# augmentation
'depth_augment',
'spect_augment',
'random_crop1d',
'random_crop2d',
'adaptive_spect_augment',
]
def _pad_center(data: np.ndarray, size: int, axis: int=-1,
**kwargs) -> np.ndarray:
"""Pad an array to a target length along a target axis.
This differs from `np.pad` by centering the data prior to padding,
analogous to `str.center`
"""
kwargs.setdefault("mode", "constant")
n = data.shape[axis]
lpad = int((size - n) // 2)
lengths = [(0, 0)] * data.ndim
lengths[axis] = (lpad, int(size - n - lpad))
if lpad < 0:
raise ParameterError(("Target size ({size:d}) must be "
"at least input size ({n:d})"))
return np.pad(data, lengths, **kwargs)
def _split_frames(x: np.ndarray,
frame_length: int,
hop_length: int,
axis: int=-1) -> np.ndarray:
"""Slice a data array into (overlapping) frames.
This function is aligned with librosa.frame
"""
if not isinstance(x, np.ndarray):
raise ParameterError(
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
if x.shape[axis] < frame_length:
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
f" for frame_length={frame_length:d}")
if hop_length < 1:
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.asfortranarray(x)
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.ascontiguousarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * new_stride]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * new_stride] + list(strides)
else:
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
return as_strided(x, shape=shape, strides=strides)
def _check_audio(y, mono=True) -> bool:
"""Determine whether a variable contains valid audio data.
The audio y must be a np.ndarray, ether 1-channel or two channel
"""
if not isinstance(y, np.ndarray):
raise ParameterError("Audio data must be of type numpy.ndarray")
if y.ndim > 2:
raise ParameterError(
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
if mono and y.ndim == 2:
raise ParameterError(
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
if not np.issubdtype(y.dtype, np.floating):
raise ParameterError("Audio data must be floating-point")
if not np.isfinite(y).all():
raise ParameterError("Audio buffer is not finite everywhere")
return True
def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
htk: bool=False) -> np.ndarray:
"""Convert Hz to Mels.
Args:
frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
htk (bool, optional): Use htk scaling. Defaults to False.
Returns:
np.ndarray: Frequency in mels.
"""
freq = np.asanyarray(frequencies)
if htk:
return 2595.0 * np.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
if freq.ndim:
# If we have array data, vectorize
log_t = freq >= min_log_hz
mels[log_t] = min_log_mel + \
np.log(freq[log_t] / min_log_hz) / logstep
elif freq >= min_log_hz:
# If we have scalar data, heck directly
mels = min_log_mel + np.log(freq / min_log_hz) / logstep
return mels
def mel_to_hz(mels: Union[float, List[float], np.ndarray],
htk: int=False) -> np.ndarray:
"""Convert mel bin numbers to frequencies.
Args:
mels (Union[float, List[float], np.ndarray]): Frequency in mels.
htk (bool, optional): Use htk scaling. Defaults to False.
Returns:
np.ndarray: Frequencies in Hz.
"""
mel_array = np.asanyarray(mels)
if htk:
return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel_array
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
if mel_array.ndim:
# If we have vector data, vectorize
log_t = mel_array >= min_log_mel
freqs[log_t] = min_log_hz * \
np.exp(logstep * (mel_array[log_t] - min_log_mel))
elif mel_array >= min_log_mel:
# If we have scalar data, check directly
freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
return freqs
def mel_frequencies(n_mels: int=128,
fmin: float=0.0,
fmax: float=11025.0,
htk: bool=False) -> np.ndarray:
"""Compute mel frequencies.
Args:
n_mels (int, optional): Number of mel bins. Defaults to 128.
fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
htk (bool, optional): Use htk scaling. Defaults to False.
Returns:
np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(fmin, htk=htk)
max_mel = hz_to_mel(fmax, htk=htk)
mels = np.linspace(min_mel, max_mel, n_mels)
return mel_to_hz(mels, htk=htk)
def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
"""Compute fourier frequencies.
Args:
sr (int): Sample rate.
n_fft (int): FFT size.
Returns:
np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
"""
return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=128,
fmin: float=0.0,
fmax: Optional[float]=None,
htk: bool=False,
norm: str="slaney",
dtype: type=np.float32) -> np.ndarray:
"""Compute fbank matrix.
Args:
sr (int): Sample rate.
n_fft (int): FFT size.
n_mels (int, optional): Number of mel bins. Defaults to 128.
fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
htk (bool, optional): Use htk scaling. Defaults to False.
norm (str, optional): Type of normalization. Defaults to "slaney".
dtype (type, optional): Data type. Defaults to np.float32.
Returns:
np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
"""
if norm != "slaney":
raise ParameterError('norm must set to slaney')
if fmax is None:
fmax = float(sr) / 2
# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
if norm == "slaney":
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]
# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
warnings.warn("Empty filters detected in mel frequency basis. "
"Some channels will produce empty responses. "
"Try increasing your sampling rate (and fmax) or "
"reducing n_mels.")
return weights
def stft(x: np.ndarray,
n_fft: int=2048,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str="hann",
center: bool=True,
dtype: type=np.complex64,
pad_mode: str="reflect") -> np.ndarray:
"""Short-time Fourier transform (STFT).
Args:
x (np.ndarray): Input waveform in one dimension.
n_fft (int, optional): FFT size. Defaults to 2048.
hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
win_length (Optional[int], optional): The size of window. Defaults to None.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
Returns:
np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.
"""
_check_audio(x)
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
fft_window = signal.get_window(window, win_length, fftbins=True)
# Pad the window out to n_fft size
fft_window = _pad_center(fft_window, n_fft)
# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))
# Pad the time series so that frames are centered
if center:
if n_fft > x.shape[-1]:
warnings.warn(
f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
)
x = np.pad(x, int(n_fft // 2), mode=pad_mode)
elif n_fft > x.shape[-1]:
raise ParameterError(
f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
)
# Window the time series.
x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
# Pre-allocate the STFT matrix
stft_matrix = np.empty(
(int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
fft = np.fft # use numpy fft as default
# Constrain STFT block sizes to 256 KB
MAX_MEM_BLOCK = 2**8 * 2**10
# how many columns can we fit within MAX_MEM_BLOCK?
n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
n_columns = max(n_columns, 1)
for bl_s in range(0, stft_matrix.shape[1], n_columns):
bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
stft_matrix[:, bl_s:bl_t] = fft.rfft(
fft_window * x_frames[:, bl_s:bl_t], axis=0)
return stft_matrix
def power_to_db(spect: np.ndarray,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=80.0) -> np.ndarray:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
Args:
spect (np.ndarray): STFT power spectrogram of an input waveform.
ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): Minimum threshold. Defaults to 1e-10.
top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0.
Returns:
np.ndarray: Power spectrogram in db scale.
"""
spect = np.asarray(spect)
if amin <= 0:
raise ParameterError("amin must be strictly positive")
if np.issubdtype(spect.dtype, np.complexfloating):
warnings.warn(
"power_to_db was called on complex input so phase "
"information will be discarded. To suppress this warning, "
"call power_to_db(np.abs(D)**2) instead.")
magnitude = np.abs(spect)
else:
magnitude = spect
if callable(ref):
# User supplied a function to calculate reference power
ref_value = ref(magnitude)
else:
ref_value = np.abs(ref)
log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
if top_db is not None:
if top_db < 0:
raise ParameterError("top_db must be non-negative")
log_spec = np.maximum(log_spec, log_spec.max() - top_db)
return log_spec
def mfcc(x: np.ndarray,
sr: int=16000,
spect: Optional[np.ndarray]=None,
n_mfcc: int=20,
dct_type: int=2,
norm: str="ortho",
lifter: int=0,
**kwargs) -> np.ndarray:
"""Mel-frequency cepstral coefficients (MFCCs)
Args:
x (np.ndarray): Input waveform in one dimension.
sr (int, optional): Sample rate. Defaults to 16000.
spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None.
n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
norm (str, optional): Type of normalization. Defaults to "ortho".
lifter (int, optional): Cepstral filtering. Defaults to 0.
Returns:
np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
"""
if spect is None:
spect = melspectrogram(x, sr=sr, **kwargs)
M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
if lifter > 0:
factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
lifter)
return M * factor[:, np.newaxis]
elif lifter == 0:
return M
else:
raise ParameterError(
f"MFCC lifter={lifter} must be a non-negative number")
def melspectrogram(x: np.ndarray,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
n_mels: int=64,
fmin: float=50.0,
fmax: Optional[float]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0,
to_db: bool=True,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> np.ndarray:
"""Compute mel-spectrogram.
Args:
x (np.ndarray): Input waveform in one dimension.
sr (int, optional): Sample rate. Defaults to 16000.
window_size (int, optional): Size of FFT and window length. Defaults to 512.
hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
n_mels (int, optional): Number of mel bins. Defaults to 64.
fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
to_db (bool, optional): Enable db scale. Defaults to True.
ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): Minimum threshold. Defaults to 1e-10.
top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
Returns:
np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
"""
_check_audio(x, mono=True)
if len(x) <= 0:
raise ParameterError('The input waveform is empty')
if fmax is None:
fmax = sr // 2
if fmin < 0 or fmin >= fmax:
raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
spect_power = np.abs(s)**power
fb_matrix = compute_fbank_matrix(
sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
mel_spect = np.matmul(fb_matrix, spect_power)
if to_db:
return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
else:
return mel_spect
def spectrogram(x: np.ndarray,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0) -> np.ndarray:
"""Compute spectrogram.
Args:
x (np.ndarray): Input waveform in one dimension.
sr (int, optional): Sample rate. Defaults to 16000.
window_size (int, optional): Size of FFT and window length. Defaults to 512.
hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
window (str, optional): A string of window specification. Defaults to "hann".
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
Returns:
np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`.
"""
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
return np.abs(s)**power
def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
"""Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`.
Args:
x (np.ndarray): The input waveform to encode.
mu (int, optional): The endoceding parameter. Defaults to 255.
quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
Returns:
np.ndarray: The mu-law encoded waveform.
"""
mu = 255
y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
if quantized:
y = np.floor((y + 1) / 2 * mu + 0.5) # convert to [0 , mu-1]
return y
def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
"""Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when quantize is True and `[-1,1]` otherwise.
Args:
y (np.ndarray): The encoded waveform.
mu (int, optional): The endoceding parameter. Defaults to 255.
quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
Returns:
np.ndarray: The mu-law decoded waveform.
"""
if mu < 1:
raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
mu = mu - 1
if quantized: # undo the quantization
y = y * 2 / mu - 1
x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
return x
def _randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
"""
return int(np.random.randint(0, high=high))
def depth_augment(y: np.ndarray,
choices: List=['int8', 'int16'],
probs: List[float]=[0.5, 0.5]) -> np.ndarray:
""" Audio depth augmentation. Do audio depth augmentation to simulate the distortion brought by quantization.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
choices (List, optional): A list of data type to depth conversion. Defaults to ['int8', 'int16'].
probs (List[float], optional): Probabilities to depth conversion. Defaults to [0.5, 0.5].
Returns:
np.ndarray: The augmented waveform.
"""
assert len(probs) == len(
choices
), 'number of choices {} must be equal to size of probs {}'.format(
len(choices), len(probs))
depth = np.random.choice(choices, p=probs)
src_depth = y.dtype
y1 = depth_convert(y, depth)
y2 = depth_convert(y1, src_depth)
return y2
def adaptive_spect_augment(spect: np.ndarray,
tempo_axis: int=0,
level: float=0.1) -> np.ndarray:
"""Do adpative spectrogram augmentation. The level of the augmentation is gowern by the paramter level, ranging from 0 to 1, with 0 represents no augmentation.
Args:
spect (np.ndarray): Input spectrogram.
tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
level (float, optional): The level factor of masking. Defaults to 0.1.
Returns:
np.ndarray: The augmented spectrogram.
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
time_mask_width = int(nt * level * 0.5)
freq_mask_width = int(nf * level * 0.5)
num_time_mask = int(10 * level)
num_freq_mask = int(10 * level)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = _randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = _randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = _randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = _randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def spect_augment(spect: np.ndarray,
tempo_axis: int=0,
max_time_mask: int=3,
max_freq_mask: int=3,
max_time_mask_width: int=30,
max_freq_mask_width: int=20) -> np.ndarray:
"""Do spectrogram augmentation in both time and freq axis.
Args:
spect (np.ndarray): Input spectrogram.
tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
max_freq_mask (int, optional): Maximum number of frenquence masking. Defaults to 3.
max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
max_freq_mask_width (int, optional): Maximum width of frenquence masking. Defaults to 20.
Returns:
np.ndarray: The augmented spectrogram.
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
num_time_mask = _randint(max_time_mask)
num_freq_mask = _randint(max_freq_mask)
time_mask_width = _randint(max_time_mask_width)
freq_mask_width = _randint(max_freq_mask_width)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = _randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = _randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = _randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = _randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
""" Random cropping on a input waveform.
Args:
y (np.ndarray): Input waveform array in 1D.
crop_len (int): Length of waveform to crop.
Returns:
np.ndarray: The cropped waveform.
"""
if y.ndim != 1:
'only accept 1d tensor or numpy array'
n = len(y)
idx = _randint(n - crop_len)
return y[idx:idx + crop_len]
def random_crop2d(s: np.ndarray, crop_len: int,
tempo_axis: int=0) -> np.ndarray:
""" Random cropping on a spectrogram.
Args:
s (np.ndarray): Input spectrogram in 2D.
crop_len (int): Length of spectrogram to crop.
tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
Returns:
np.ndarray: The cropped spectrogram.
"""
if tempo_axis >= s.ndim:
raise ParameterError('axis out of range')
n = s.shape[tempo_axis]
idx = _randint(high=n - crop_len)
sli = [slice(None) for i in range(s.ndim)]
sli[tempo_axis] = slice(idx, idx + crop_len)
out = s[tuple(sli)]
return out

@ -0,0 +1,20 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .esc50 import ESC50
from .gtzan import GTZAN
from .hey_snips import HeySnips
from .rirs_noises import OpenRIRNoise
from .tess import TESS
from .urban_sound import UrbanSound8K
from .voxceleb import VoxCeleb

@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from ..backends.soundfile_backend import soundfile_load as load_audio
from ..compliance.kaldi import fbank as kaldi_fbank
from ..compliance.kaldi import mfcc as kaldi_mfcc
from ..compliance.librosa import melspectrogram
from ..compliance.librosa import mfcc
feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
'kaldi_fbank': kaldi_fbank,
'kaldi_mfcc': kaldi_mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
"""
Base class of audio classification dataset.
"""
def __init__(self,
files: List[str],
labels: List[int],
feat_type: str='raw',
sample_rate: int=None,
**kwargs):
"""
Ags:
files (:obj:`List[str]`): A list of absolute path of audio files.
labels (:obj:`List[int]`): Labels of audio files.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that user wants to extrace of an audio file.
"""
super(AudioClassificationDataset, self).__init__()
if feat_type not in feat_funcs.keys():
raise RuntimeError(
f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
)
self.files = files
self.labels = labels
self.feat_type = feat_type
self.sample_rate = sample_rate
self.feat_config = kwargs # Pass keyword arguments to customize feature config
def _get_data(self, input_file: str):
raise NotImplementedError
def _convert_to_record(self, idx):
file, label = self.files[idx], self.labels[idx]
if self.sample_rate is None:
waveform, sample_rate = load_audio(file)
else:
waveform, sample_rate = load_audio(file, sr=self.sample_rate)
feat_func = feat_funcs[self.feat_type]
record = {}
if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
waveform = paddle.to_tensor(waveform).unsqueeze(0) # (C, T)
record['feat'] = feat_func(
waveform=waveform, sr=self.sample_rate, **self.feat_config)
else:
record['feat'] = feat_func(
waveform, sample_rate,
**self.feat_config) if feat_func else waveform
record['label'] = label
return record
def __getitem__(self, idx):
record = self._convert_to_record(idx)
if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
return self.keys[idx], record['feat'], record['label']
else:
return np.array(record['feat']).transpose(), np.array(
record['label'], dtype=np.int64)
def __len__(self):
return len(self.files)

@ -0,0 +1,152 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
class ESC50(AudioClassificationDataset):
"""
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
suitable for benchmarking methods of environmental sound classification. The dataset
consists of 5-second-long recordings organized into 50 semantical classes (with
40 examples per class)
Reference:
ESC: Dataset for Environmental Sound Classification
http://dx.doi.org/10.1145/2733373.2806390
"""
archieves = [
{
'url':
'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
'md5': '7771e4b9d86d0945acce719c7a59305a',
},
]
label_list = [
# Animals
'Dog',
'Rooster',
'Pig',
'Cow',
'Frog',
'Cat',
'Hen',
'Insects (flying)',
'Sheep',
'Crow',
# Natural soundscapes & water sounds
'Rain',
'Sea waves',
'Crackling fire',
'Crickets',
'Chirping birds',
'Water drops',
'Wind',
'Pouring water',
'Toilet flush',
'Thunderstorm',
# Human, non-speech sounds
'Crying baby',
'Sneezing',
'Clapping',
'Breathing',
'Coughing',
'Footsteps',
'Laughing',
'Brushing teeth',
'Snoring',
'Drinking, sipping',
# Interior/domestic sounds
'Door knock',
'Mouse click',
'Keyboard typing',
'Door, wood creaks',
'Can opening',
'Washing machine',
'Vacuum cleaner',
'Clock alarm',
'Clock tick',
'Glass breaking',
# Exterior/urban noises
'Helicopter',
'Chainsaw',
'Siren',
'Car horn',
'Engine',
'Train',
'Church bells',
'Airplane',
'Fireworks',
'Hand saw',
]
meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
meta_info = collections.namedtuple(
'META_INFO',
('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
audio_path = os.path.join('ESC-50-master', 'audio')
def __init__(self,
mode: str='train',
split: int=1,
feat_type: str='raw',
**kwargs):
"""
Ags:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
split (:obj:`int`, `optional`, defaults to 1):
It specify the fold of dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that user wants to extrace of an audio file.
"""
files, labels = self._get_data(mode, split)
super(ESC50, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, fold, target, _, _, _, _ = sample
if mode == 'train' and int(fold) != split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels

@ -0,0 +1,115 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
class GTZAN(AudioClassificationDataset):
"""
The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
in machine listening research for music genre recognition (MGR).
Reference:
Musical genre classification of audio signals
https://ieeexplore.ieee.org/document/1021072/
"""
archieves = [
{
'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
},
]
label_list = [
'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
'pop', 'reggae', 'rock'
]
meta = os.path.join('genres', 'input.mf')
meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
audio_path = 'genres'
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Ags:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Set the random seed to shuffle samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specify the fold of dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that user wants to extrace of an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(GTZAN, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines():
ret.append(self.meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
random.seed(seed) # shuffle samples to split data
random.shuffle(
meta_info
) # make sure using the same seed to create train and dev dataset
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
file_path, label = sample
filename = os.path.basename(file_path)
target = self.label_list.index(label)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(
os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(
os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
return files, labels

@ -0,0 +1,74 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import json
import os
from typing import List
from typing import Tuple
from .dataset import AudioClassificationDataset
__all__ = ['HeySnips']
class HeySnips(AudioClassificationDataset):
meta_info = collections.namedtuple('META_INFO',
('key', 'label', 'duration', 'wav'))
def __init__(self,
data_dir: os.PathLike,
mode: str='train',
feat_type: str='kaldi_fbank',
sample_rate: int=16000,
**kwargs):
self.data_dir = data_dir
files, labels = self._get_data(mode)
super(HeySnips, self).__init__(
files=files,
labels=labels,
feat_type=feat_type,
sample_rate=sample_rate,
**kwargs)
def _get_meta_info(self, mode) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(self.data_dir, '{}.json'.format(mode)),
'r') as f:
data = json.load(f)
for item in data:
sample = collections.OrderedDict()
if item['duration'] > 0:
sample['key'] = item['id']
sample['label'] = 0 if item['is_hotword'] == 1 else -1
sample['duration'] = item['duration']
sample['wav'] = os.path.join(self.data_dir,
item['audio_file_path'])
ret.append(self.meta_info(*sample.values()))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
meta_info = self._get_meta_info(mode)
files = []
labels = []
self.keys = []
self.durations = []
for sample in meta_info:
key, target, duration, wav = sample
files.append(wav)
labels.append(int(target))
self.keys.append(key)
self.durations.append(float(duration))
return files, labels

@ -0,0 +1,201 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import os
import random
from typing import List
from paddle.io import Dataset
from tqdm import tqdm
from ..backends.soundfile_backend import soundfile_load as load_audio
from ..backends.soundfile_backend import soundfile_save as save_wav
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from .dataset import feat_funcs
__all__ = ['OpenRIRNoise']
class OpenRIRNoise(Dataset):
archieves = [
{
'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
'md5': 'e6f48e257286e05de56413b4779d8ffb',
},
]
sample_rate = 16000
meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
base_path = os.path.join(DATA_HOME, 'open_rir_noise')
wav_path = os.path.join(base_path, 'RIRS_NOISES')
csv_path = os.path.join(base_path, 'csv')
subsets = ['rir', 'noise']
def __init__(self,
subset: str='rir',
feat_type: str='raw',
target_dir=None,
random_chunk: bool=True,
chunk_duration: float=3.0,
seed: int=0,
**kwargs):
assert subset in self.subsets, \
'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self.random_chunk = random_chunk
self.chunk_duration = chunk_duration
OpenRIRNoise.csv_path = os.path.join(
target_dir, "open_rir_noise",
"csv") if target_dir else self.csv_path
self._data = self._get_data()
super(OpenRIRNoise, self).__init__()
# Set up a seed to reproduce training or predicting result.
# random.seed(seed)
def _get_data(self):
# Download audio files.
print(f"rirs noises base path: {self.base_path}")
if not os.path.isdir(self.base_path):
download_and_decompress(
self.archieves, self.base_path, decompress=True)
else:
print(
f"{self.base_path} already exists, we will not download and decompress again"
)
# Data preparation.
print(f"prepare the csv to {self.csv_path}")
if not os.path.isdir(self.csv_path):
os.makedirs(self.csv_path)
self.prepare_data()
data = []
with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
for line in rf.readlines()[1:]:
audio_id, duration, wav = line.strip().split(',')
data.append(self.meta_info(audio_id, float(duration), wav))
random.shuffle(data)
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(record['wav'])
assert self.feat_type in feat_funcs.keys(), \
f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sr=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat})
return record
@staticmethod
def _get_chunks(seg_dur, audio_id, audio_duration):
num_chunks = int(audio_duration / seg_dur) # all in milliseconds
chunk_lst = [
audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
for i in range(num_chunks)
]
return chunk_lst
def _get_audio_info(self, wav_file: str,
split_chunks: bool) -> List[List[str]]:
waveform, sr = load_audio(wav_file)
audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
audio_duration = waveform.shape[0] / sr
ret = []
if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds.
uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
audio_duration)
for idx, chunk in enumerate(uniq_chunks_list):
s, e = chunk.split("_")[-2:] # Timestamps of start and end
start_sample = int(float(s) * sr)
end_sample = int(float(e) * sr)
new_wav_file = os.path.join(self.base_path,
audio_id + f'_chunk_{idx+1:02}.wav')
save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
# id, duration, new_wav
ret.append([chunk, self.chunk_duration, new_wav_file])
else: # Keep whole audio.
ret.append([audio_id, audio_duration, wav_file])
return ret
def generate_csv(self,
wav_files: List[str],
output_file: str,
split_chunks: bool=True):
print(f'Generating csv: {output_file}')
header = ["id", "duration", "wav"]
infos = list(
tqdm(
map(self._get_audio_info, wav_files, [split_chunks] * len(
wav_files)),
total=len(wav_files)))
csv_lines = []
for info in infos:
csv_lines.extend(info)
with open(output_file, mode="w") as csv_f:
csv_writer = csv.writer(
csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow(header)
for line in csv_lines:
csv_writer.writerow(line)
def prepare_data(self):
rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
"rir_list")
rir_files = []
with open(rir_list, 'r') as f:
for line in f.readlines():
rir_file = line.strip().split(' ')[-1]
rir_files.append(os.path.join(self.base_path, rir_file))
noise_list = os.path.join(self.wav_path, "pointsource_noises",
"noise_list")
noise_files = []
with open(noise_list, 'r') as f:
for line in f.readlines():
noise_file = line.strip().split(' ')[-1]
noise_files.append(os.path.join(self.base_path, noise_file))
self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))
def __getitem__(self, idx):
return self._convert_to_record(idx)
def __len__(self):
return len(self._data)

@ -0,0 +1,126 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['TESS']
class TESS(AudioClassificationDataset):
"""
TESS is a set of 200 target words were spoken in the carrier phrase
"Say the word _____' by two actresses (aged 26 and 64 years) and
recordings were made of the set portraying each of seven emotions(anger,
disgust, fear, happiness, pleasant surprise, sadness, and neutral).
There are 2800 stimuli in total.
Reference:
Toronto emotional speech set (TESS)
https://doi.org/10.5683/SP2/E8H2MF
"""
archieves = [
{
'url':
'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
'md5':
'1465311b24d1de704c4c63e4ccc470c7',
},
]
label_list = [
'angry',
'disgust',
'fear',
'happy',
'neutral',
'ps', # pleasant surprise
'sad',
]
meta_info = collections.namedtuple('META_INFO',
('speaker', 'word', 'emotion'))
audio_path = 'TESS_Toronto_emotional_speech_set'
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Ags:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Set the random seed to shuffle samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specify the fold of dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that user wants to extrace of an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(TESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('_')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
) # make sure using the same seed to create train and dev dataset
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion = sample
target = self.label_list.index(emotion)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels

@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
class UrbanSound8K(AudioClassificationDataset):
"""
UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The
classes are drawn from the urban sound taxonomy.
Reference:
A Dataset and Taxonomy for Urban Sound Research
https://dl.acm.org/doi/10.1145/2647868.2655045
"""
archieves = [
{
'url':
'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
'md5': '9aa69802bbf37fb986f71ec1483a196e',
},
]
label_list = [
"air_conditioner", "car_horn", "children_playing", "dog_bark",
"drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
"street_music"
]
meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
meta_info = collections.namedtuple(
'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
'class_id', 'label'))
audio_path = os.path.join('UrbanSound8K', 'audio')
def __init__(self,
mode: str='train',
split: int=1,
feat_type: str='raw',
**kwargs):
files, labels = self._get_data(mode, split)
super(UrbanSound8K, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
"""
Ags:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
split (:obj:`int`, `optional`, defaults to 1):
It specify the fold of dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that user wants to extrace of an audio file.
"""
def _get_meta_info(self):
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, _, _, _, _, fold, target, _ = sample
if mode == 'train' and int(fold) != split:
files.append(
os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(
os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
filename))
labels.append(int(target))
return files, labels

@ -0,0 +1,356 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import glob
import os
import random
from multiprocessing import cpu_count
from typing import List
from paddle.io import Dataset
from pathos.multiprocessing import Pool
from tqdm import tqdm
from ..backends.soundfile_backend import soundfile_load as load_audio
from ..utils import DATA_HOME
from ..utils import decompress
from ..utils.download import download_and_decompress
from .dataset import feat_funcs
__all__ = ['VoxCeleb']
class VoxCeleb(Dataset):
source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
archieves_audio_dev = [
{
'url': source_url + 'vox1_dev_wav_partaa',
'md5': 'e395d020928bc15670b570a21695ed96',
},
{
'url': source_url + 'vox1_dev_wav_partab',
'md5': 'bbfaaccefab65d82b21903e81a8a8020',
},
{
'url': source_url + 'vox1_dev_wav_partac',
'md5': '017d579a2a96a077f40042ec33e51512',
},
{
'url': source_url + 'vox1_dev_wav_partad',
'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
},
]
archieves_audio_test = [
{
'url': source_url + 'vox1_test_wav.zip',
'md5': '185fdc63c3c739954633d50379a3d102',
},
]
archieves_meta = [
{
'url':
'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
'md5':
'b73110731c9223c1461fe49cb48dddfc',
},
]
num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
sample_rate = 16000
meta_info = collections.namedtuple(
'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
base_path = os.path.join(DATA_HOME, 'vox1')
wav_path = os.path.join(base_path, 'wav')
meta_path = os.path.join(base_path, 'meta')
veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
csv_path = os.path.join(base_path, 'csv')
subsets = ['train', 'dev', 'enroll', 'test']
def __init__(
self,
subset: str='train',
feat_type: str='raw',
random_chunk: bool=True,
chunk_duration: float=3.0, # seconds
split_ratio: float=0.9, # train split ratio
seed: int=0,
target_dir: str=None,
vox2_base_path=None,
**kwargs):
"""VoxCeleb data prepare and get the specific dataset audio info
Args:
subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'.
random_chunk (bool, optional): random select a duration from audio. Defaults to True.
chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0.
target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
"""
assert subset in self.subsets, \
'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
self.subset = subset
self.spk_id2label = {}
self.feat_type = feat_type
self.feat_config = kwargs
self.random_chunk = random_chunk
self.chunk_duration = chunk_duration
self.split_ratio = split_ratio
self.target_dir = target_dir if target_dir else VoxCeleb.base_path
self.vox2_base_path = vox2_base_path
# if we set the target dir, we will change the vox data info data from base path to target dir
VoxCeleb.csv_path = os.path.join(
target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
VoxCeleb.meta_path = os.path.join(
target_dir, "voxceleb",
'meta') if target_dir else VoxCeleb.meta_path
VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
'veri_test2.txt')
# self._data = self._get_data()[:1000] # KP: Small dataset test.
self._data = self._get_data()
super(VoxCeleb, self).__init__()
# Set up a seed to reproduce training or predicting result.
# random.seed(seed)
def _get_data(self):
# Download audio files.
# We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
# so, we check the vox1/wav dir status
print(f"wav base path: {self.wav_path}")
if not os.path.isdir(self.wav_path):
print("start to download the voxceleb1 dataset")
download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip
self.archieves_audio_dev,
self.base_path,
decompress=False)
download_and_decompress( # download the vox1_test_wav.zip and unzip
self.archieves_audio_test,
self.base_path,
decompress=True)
# Download all parts and concatenate the files into one zip file.
dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
print(f'Concatenating all parts to: {dev_zipfile}')
os.system(
f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}'
)
# Extract all audio files of dev and test set.
decompress(dev_zipfile, self.base_path)
# Download meta files.
if not os.path.isdir(self.meta_path):
print("prepare the meta data")
download_and_decompress(
self.archieves_meta, self.meta_path, decompress=False)
# Data preparation.
if not os.path.isdir(self.csv_path):
os.makedirs(self.csv_path)
self.prepare_data()
data = []
print(
f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"
)
with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
for line in rf.readlines()[1:]:
audio_id, duration, wav, start, stop, spk_id = line.strip(
).split(',')
data.append(
self.meta_info(audio_id,
float(duration), wav,
int(start), int(stop), spk_id))
with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
for line in f.readlines():
spk_id, label = line.strip().split(' ')
self.spk_id2label[spk_id] = int(label)
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(record['wav'])
# random select a chunk audio samples from the audio
if self.random_chunk:
num_wav_samples = waveform.shape[0]
num_chunk_samples = int(self.chunk_duration * sr)
start = random.randint(0, num_wav_samples - num_chunk_samples - 1)
stop = start + num_chunk_samples
else:
start = record['start']
stop = record['stop']
waveform = waveform[start:stop]
assert self.feat_type in feat_funcs.keys(), \
f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sr=sr, **self.feat_config) if feat_func else waveform
record.update({'feat': feat})
if self.subset in ['train',
'dev']: # Labels are available in train and dev.
record.update({'label': self.spk_id2label[record['spk_id']]})
return record
@staticmethod
def _get_chunks(seg_dur, audio_id, audio_duration):
num_chunks = int(audio_duration / seg_dur) # all in milliseconds
chunk_lst = [
audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
for i in range(num_chunks)
]
return chunk_lst
def _get_audio_info(self, wav_file: str,
split_chunks: bool) -> List[List[str]]:
waveform, sr = load_audio(wav_file)
spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
audio_duration = waveform.shape[0] / sr
ret = []
if split_chunks: # Split into pieces of self.chunk_duration seconds.
uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
audio_duration)
for chunk in uniq_chunks_list:
s, e = chunk.split("_")[-2:] # Timestamps of start and end
start_sample = int(float(s) * sr)
end_sample = int(float(e) * sr)
# id, duration, wav, start, stop, spk_id
ret.append([
chunk, audio_duration, wav_file, start_sample, end_sample,
spk_id
])
else: # Keep whole audio.
ret.append([
audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
])
return ret
def generate_csv(self,
wav_files: List[str],
output_file: str,
split_chunks: bool=True):
print(f'Generating csv: {output_file}')
header = ["id", "duration", "wav", "start", "stop", "spk_id"]
# Note: this may occurs c++ execption, but the program will execute fine
# so we can ignore the execption
with Pool(cpu_count()) as p:
infos = list(
tqdm(
p.imap(lambda x: self._get_audio_info(x, split_chunks),
wav_files),
total=len(wav_files)))
csv_lines = []
for info in infos:
csv_lines.extend(info)
with open(output_file, mode="w") as csv_f:
csv_writer = csv.writer(
csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
csv_writer.writerow(header)
for line in csv_lines:
csv_writer.writerow(line)
def prepare_data(self):
# Audio of speakers in veri_test_file should not be included in training set.
print("start to prepare the data csv file")
enroll_files = set()
test_files = set()
# get the enroll and test audio file path
with open(self.veri_test_file, 'r') as f:
for line in f.readlines():
_, enrol_file, test_file = line.strip().split(' ')
enroll_files.add(os.path.join(self.wav_path, enrol_file))
test_files.add(os.path.join(self.wav_path, test_file))
enroll_files = sorted(enroll_files)
test_files = sorted(test_files)
# get the enroll and test speakers
test_spks = set()
for file in (enroll_files + test_files):
spk = file.split('/wav/')[1].split('/')[0]
test_spks.add(spk)
# get all the train and dev audios file path
audio_files = []
speakers = set()
print("Getting file list...")
for path in [self.wav_path, self.vox2_base_path]:
# if vox2 directory is not set and vox2 is not a directory
# we will not process this directory
if not path or not os.path.exists(path):
print(f"{path} is an invalid path, please check again, "
"and we will ignore the vox2 base path")
continue
for file in glob.glob(
os.path.join(path, "**", "*.wav"), recursive=True):
spk = file.split('/wav/')[1].split('/')[0]
if spk in test_spks:
continue
speakers.add(spk)
audio_files.append(file)
print(
f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
)
# encode the train and dev speakers label to spk_id2label.txt
with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
for label, spk_id in enumerate(
sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2
f.write(f'{spk_id} {label}\n')
audio_files = sorted(audio_files)
random.shuffle(audio_files)
split_idx = int(self.split_ratio * len(audio_files))
# split_ratio to train
train_files, dev_files = audio_files[:split_idx], audio_files[
split_idx:]
self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
self.generate_csv(
enroll_files,
os.path.join(self.csv_path, 'enroll.csv'),
split_chunks=False)
self.generate_csv(
test_files,
os.path.join(self.csv_path, 'test.csv'),
split_chunks=False)
def __getitem__(self, idx):
return self._convert_to_record(idx)
def __len__(self):
return len(self._data)

@ -0,0 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .layers import LogMelSpectrogram
from .layers import MelSpectrogram
from .layers import MFCC
from .layers import Spectrogram

@ -0,0 +1,328 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Optional
from typing import Union
import paddle
import paddle.nn as nn
from paddle import Tensor
from ..functional import compute_fbank_matrix
from ..functional import create_dct
from ..functional import power_to_db
from ..functional.window import get_window
__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
'MFCC',
]
class Spectrogram(nn.Layer):
"""Compute spectrogram of given signals, typically audio waveforms.
The spectorgram is defined as the complex norm of the short-time Fourier transformation.
Args:
n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
"""
def __init__(self,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
power: float=2.0,
center: bool=True,
pad_mode: str='reflect',
dtype: str='float32') -> None:
super(Spectrogram, self).__init__()
assert power > 0, 'Power of spectrogram must be > 0.'
self.power = power
if win_length is None:
win_length = n_fft
self.fft_window = get_window(
window, win_length, fftbins=True, dtype=dtype)
self._stft = partial(
paddle.signal.stft,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=self.fft_window,
center=center,
pad_mode=pad_mode)
self.register_buffer('fft_window', self.fft_window)
def forward(self, x: Tensor) -> Tensor:
"""
Args:
x (Tensor): Tensor of waveforms with shape `(N, T)`
Returns:
Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
"""
stft = self._stft(x)
spectrogram = paddle.pow(paddle.abs(stft), self.power)
return spectrogram
class MelSpectrogram(nn.Layer):
"""Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix.
Args:
sr (int, optional): Sample rate. Defaults to 22050.
n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
"""
def __init__(self,
sr: int=22050,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
power: float=2.0,
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str='float32') -> None:
super(MelSpectrogram, self).__init__()
self._spectrogram = Spectrogram(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
power=power,
center=center,
pad_mode=pad_mode,
dtype=dtype)
self.n_mels = n_mels
self.f_min = f_min
self.f_max = f_max
self.htk = htk
self.norm = norm
if f_max is None:
f_max = sr // 2
self.fbank_matrix = compute_fbank_matrix(
sr=sr,
n_fft=n_fft,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
dtype=dtype) # float64 for better numerical results
self.register_buffer('fbank_matrix', self.fbank_matrix)
def forward(self, x: Tensor) -> Tensor:
"""
Args:
x (Tensor): Tensor of waveforms with shape `(N, T)`
Returns:
Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
"""
spect_feature = self._spectrogram(x)
mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
return mel_feature
class LogMelSpectrogram(nn.Layer):
"""Compute log-mel-spectrogram feature of given signals, typically audio waveforms.
Args:
sr (int, optional): Sample rate. Defaults to 22050.
n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
"""
def __init__(self,
sr: int=22050,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
power: float=2.0,
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None,
dtype: str='float32') -> None:
super(LogMelSpectrogram, self).__init__()
self._melspectrogram = MelSpectrogram(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
power=power,
center=center,
pad_mode=pad_mode,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
dtype=dtype)
self.ref_value = ref_value
self.amin = amin
self.top_db = top_db
def forward(self, x: Tensor) -> Tensor:
"""
Args:
x (Tensor): Tensor of waveforms with shape `(N, T)`
Returns:
Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
"""
mel_feature = self._melspectrogram(x)
log_mel_feature = power_to_db(
mel_feature,
ref_value=self.ref_value,
amin=self.amin,
top_db=self.top_db)
return log_mel_feature
class MFCC(nn.Layer):
"""Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
Args:
sr (int, optional): Sample rate. Defaults to 22050.
n_mfcc (int, optional): [description]. Defaults to 40.
n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'.
"""
def __init__(self,
sr: int=22050,
n_mfcc: int=40,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
power: float=2.0,
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None,
dtype: str=paddle.float32) -> None:
super(MFCC, self).__init__()
assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
n_mfcc, n_mels)
self._log_melspectrogram = LogMelSpectrogram(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
power=power,
center=center,
pad_mode=pad_mode,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
ref_value=ref_value,
amin=amin,
top_db=top_db,
dtype=dtype)
self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
self.register_buffer('dct_matrix', self.dct_matrix)
def forward(self, x: Tensor) -> Tensor:
"""
Args:
x (Tensor): Tensor of waveforms with shape `(N, T)`
Returns:
Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
"""
log_mel_feature = self._log_melspectrogram(x)
mfcc = paddle.matmul(
log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
(0, 2, 1)) # (B, n_mels, L)
return mfcc

@ -0,0 +1,20 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .functional import compute_fbank_matrix
from .functional import create_dct
from .functional import fft_frequencies
from .functional import hz_to_mel
from .functional import mel_frequencies
from .functional import mel_to_hz
from .functional import power_to_db

@ -0,0 +1,266 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import math
from typing import Optional
from typing import Union
import paddle
from paddle import Tensor
__all__ = [
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'fft_frequencies',
'compute_fbank_matrix',
'power_to_db',
'create_dct',
]
def hz_to_mel(freq: Union[Tensor, float],
htk: bool=False) -> Union[Tensor, float]:
"""Convert Hz to Mels.
Args:
freq (Union[Tensor, float]): The input tensor with arbitrary shape.
htk (bool, optional): Use htk scaling. Defaults to False.
Returns:
Union[Tensor, float]: Frequency in mels.
"""
if htk:
if isinstance(freq, Tensor):
return 2595.0 * paddle.log10(1.0 + freq / 700.0)
else:
return 2595.0 * math.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(freq, Tensor):
target = min_log_mel + paddle.log(
freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10
mask = (freq > min_log_hz).astype(freq.dtype)
mels = target * mask + mels * (
1 - mask) # will replace by masked_fill OP in future
else:
if freq >= min_log_hz:
mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
return mels
def mel_to_hz(mel: Union[float, Tensor],
htk: bool=False) -> Union[float, Tensor]:
"""Convert mel bin numbers to frequencies.
Args:
mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape.
htk (bool, optional): Use htk scaling. Defaults to False.
Returns:
Union[float, Tensor]: Frequencies in Hz.
"""
if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0)
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(mel, Tensor):
target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
mask = (mel > min_log_mel).astype(mel.dtype)
freqs = target * mask + freqs * (
1 - mask) # will replace by masked_fill OP in future
else:
if mel >= min_log_mel:
freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
return freqs
def mel_frequencies(n_mels: int=64,
f_min: float=0.0,
f_max: float=11025.0,
htk: bool=False,
dtype: str='float32') -> Tensor:
"""Compute mel frequencies.
Args:
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
htk (bool, optional): Use htk scaling. Defaults to False.
dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
Returns:
Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk)
max_mel = hz_to_mel(f_max, htk=htk)
mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
freqs = mel_to_hz(mels, htk=htk)
return freqs
def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
"""Compute fourier frequencies.
Args:
sr (int): Sample rate.
n_fft (int): Number of fft bins.
dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
Returns:
Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
"""
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=64,
f_min: float=0.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str='float32') -> Tensor:
"""Compute fbank matrix.
Args:
sr (int): Sample rate.
n_fft (int): Number of fft bins.
n_mels (int, optional): Number of mel bins. Defaults to 64.
f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
htk (bool, optional): Use htk scaling. Defaults to False.
norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'.
dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
Returns:
Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
"""
if f_max is None:
f_max = float(sr) / 2
# Initialize the weights
weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(
n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f)
ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
#ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = paddle.maximum(
paddle.zeros_like(lower), paddle.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
if norm == 'slaney':
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm.unsqueeze(1)
elif isinstance(norm, int) or isinstance(norm, float):
weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
return weights
def power_to_db(spect: Tensor,
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> Tensor:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
Args:
spect (Tensor): STFT power spectrogram.
ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
amin (float, optional): Minimum threshold. Defaults to 1e-10.
top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
Returns:
Tensor: Power spectrogram in db scale.
"""
if amin <= 0:
raise Exception("amin must be strictly positive")
if ref_value <= 0:
raise Exception("ref_value must be strictly positive")
ones = paddle.ones_like(spect)
log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
log_spec -= 10.0 * math.log10(max(ref_value, amin))
if top_db is not None:
if top_db < 0:
raise Exception("top_db must be non-negative")
log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
return log_spec
def create_dct(n_mfcc: int,
n_mels: int,
norm: Optional[str]='ortho',
dtype: str='float32') -> Tensor:
"""Create a discrete cosine transform(DCT) matrix.
Args:
n_mfcc (int): Number of mel frequency cepstral coefficients.
n_mels (int): Number of mel filterbanks.
norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'.
dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
Returns:
Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
"""
n = paddle.arange(n_mels, dtype=dtype)
k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) *
k) # size (n_mfcc, n_mels)
if norm is None:
dct *= 2.0
else:
assert norm == "ortho"
dct[0] *= 1.0 / math.sqrt(2.0)
dct *= math.sqrt(2.0 / float(n_mels))
return dct.T

@ -0,0 +1,337 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import math
from typing import List
from typing import Tuple
from typing import Union
import paddle
from paddle import Tensor
__all__ = [
'get_window',
]
def _cat(x: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_, data_type) for _ in x]
return paddle.concat(l)
def _acosh(x: Union[Tensor, float]) -> Tensor:
if isinstance(x, float):
return math.log(x + math.sqrt(x**2 - 1))
return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
def _extend(M: int, sym: bool) -> bool:
"""Extend window by 1 sample if needed for DFT-even symmetry. """
if not sym:
return M + 1, True
else:
return M, False
def _len_guards(M: int) -> bool:
"""Handle small or incorrect window lengths. """
if int(M) != M or M < 0:
raise ValueError('Window length M must be a non-negative integer')
return M <= 1
def _truncate(w: Tensor, needed: bool) -> Tensor:
"""Truncate window by 1 sample if needed for DFT-even symmetry. """
if needed:
return w[:-1]
else:
return w
def _general_gaussian(M: int, p, sig, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a window with a generalized Gaussian shape.
This function is consistent with scipy.signal.windows.general_gaussian().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
return _truncate(w, needs_trunc)
def _general_cosine(M: int, a: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generic weighted sum of cosine terms window.
This function is consistent with scipy.signal.windows.general_cosine().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
w = paddle.zeros((M, ), dtype=dtype)
for k in range(len(a)):
w += a[k] * paddle.cos(k * fac)
return _truncate(w, needs_trunc)
def _general_hamming(M: int, alpha: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generalized Hamming window.
This function is consistent with scipy.signal.windows.general_hamming()
"""
return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
def _taylor(M: int,
nbar=4,
sll=30,
norm=True,
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Taylor window.
The Taylor window taper function approximates the Dolph-Chebyshev window's
constant sidelobe level for a parameterized number of near-in sidelobes.
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
# Original text uses a negative sidelobe level parameter and then negates
# it in the calculation of B. To keep consistent with other methods we
# assume the sidelobe level parameter to be positive.
B = 10**(sll / 20)
A = _acosh(B) / math.pi
s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
ma = paddle.arange(1, nbar, dtype=dtype)
Fm = paddle.empty((nbar - 1, ), dtype=dtype)
signs = paddle.empty_like(ma)
signs[::2] = 1
signs[1::2] = -1
m2 = ma * ma
for mi in range(len(ma)):
numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
))
if mi == 0:
denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
elif mi == len(ma) - 1:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
else:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
mi] / m2[mi + 1:])
Fm[mi] = numer / denom
def W(n):
return 1 + 2 * paddle.matmul(
Fm.unsqueeze(0),
paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
w = W(paddle.arange(0, M, dtype=dtype))
# normalize (Note that this is not described in the original text [1])
if norm:
scale = 1.0 / W((M - 1) / 2)
w *= scale
w = w.squeeze()
return _truncate(w, needs_trunc)
def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hamming window.
The Hamming window is a taper formed by using a raised cosine with
non-zero endpoints, optimized to minimize the nearest side lobe.
"""
return _general_hamming(M, 0.54, sym, dtype=dtype)
def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hann window.
The Hann window is a taper formed by using a raised cosine or sine-squared
with ends that touch zero.
"""
return _general_hamming(M, 0.5, sym, dtype=dtype)
def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Tukey window.
The Tukey window is also known as a tapered cosine window.
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
if alpha <= 0:
return paddle.ones((M, ), dtype=dtype)
elif alpha >= 1.0:
return hann(M, sym=sym)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype)
width = int(alpha * (M - 1) / 2.0)
n1 = n[0:width + 1]
n2 = n[width + 1:M - width - 1]
n3 = n[M - width - 1:]
w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
w2 = paddle.ones(n2.shape, dtype=dtype)
w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
(M - 1))))
w = paddle.concat([w1, w2, w3])
return _truncate(w, needs_trunc)
def _kaiser(M: int, beta: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Kaiser window.
The Kaiser window is a taper formed by using a Bessel function.
"""
raise NotImplementedError()
def _gaussian(M: int, std: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Gaussian window.
The Gaussian widows has a Gaussian shape defined by the standard deviation(std).
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
sig2 = 2 * std * std
w = paddle.exp(-n**2 / sig2)
return _truncate(w, needs_trunc)
def _exponential(M: int,
center=None,
tau=1.,
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute an exponential (or Poisson) window. """
if sym and center is not None:
raise ValueError("If sym==True, center must be None.")
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
if center is None:
center = (M - 1) / 2
n = paddle.arange(0, M, dtype=dtype)
w = paddle.exp(-paddle.abs(n - center) / tau)
return _truncate(w, needs_trunc)
def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a triangular window.
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
if M % 2 == 0:
w = (2 * n - 1.0) / M
w = paddle.concat([w, w[::-1]])
else:
w = 2 * n / (M + 1.0)
w = paddle.concat([w, w[-2::-1]])
return _truncate(w, needs_trunc)
def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Bohman window.
The Bohman window is the autocorrelation of a cosine window.
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
math.pi * fac)
w = _cat([0, w, 0], dtype)
return _truncate(w, needs_trunc)
def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Blackman window.
The Blackman window is a taper formed by using the first three terms of
a summation of cosines. It was designed to have close to the minimal
leakage possible. It is close to optimal, only slightly worse than a
Kaiser window.
"""
return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a window with a simple cosine shape.
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
return _truncate(w, needs_trunc)
def get_window(window: Union[str, Tuple[str, float]],
win_length: int,
fftbins: bool=True,
dtype: str='float64') -> Tensor:
"""Return a window of a given length and type.
Args:
window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
win_length (int): Number of samples.
fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
dtype (str, optional): The data type of the return window. Defaults to 'float64'.
Returns:
Tensor: The window represented as a tensor.
"""
sym = not fftbins
args = ()
if isinstance(window, tuple):
winstr = window[0]
if len(window) > 1:
args = window[1:]
elif isinstance(window, str):
if window in ['gaussian', 'exponential']:
raise ValueError("The '" + window + "' window needs one or "
"more parameters -- pass a tuple.")
else:
winstr = window
else:
raise ValueError("%s as window type is not supported." %
str(type(window)))
try:
winfunc = eval('_' + winstr)
except KeyError as e:
raise ValueError("Unknown window type.") from e
params = (win_length, ) + args
kwargs = {'sym': sym}
return winfunc(*params, dtype=dtype, **kwargs)

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,15 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .eer import compute_eer
from .eer import compute_minDCF

@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from sklearn.metrics import roc_curve
def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
"""Compute EER and return score threshold.
Args:
labels (np.ndarray): the trial label, shape: [N], one-dimention, N refer to the samples num
scores (np.ndarray): the trial scores, shape: [N], one-dimention, N refer to the samples num
Returns:
List[float]: eer and the specific threshold
"""
fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
fnr = 1 - tpr
eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
return eer, eer_threshold
def compute_minDCF(positive_scores,
negative_scores,
c_miss=1.0,
c_fa=1.0,
p_target=0.01):
"""
This is modified from SpeechBrain
https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
Computes the minDCF metric normally used to evaluate speaker verification
systems. The min_DCF is the minimum of the following C_det function computed
within the defined threshold range:
C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target)
where p_miss is the missing probability and p_fa is the probability of having
a false alarm.
Args:
positive_scores (Paddle.Tensor): The scores from entries of the same class.
negative_scores (Paddle.Tensor): The scores from entries of different classes.
c_miss (float, optional): Cost assigned to a missing error (default 1.0).
c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
p_target (float, optional): Prior probability of having a target (default 0.01).
Returns:
List[float]: min dcf and the specific threshold
"""
# Computing candidate thresholds
if len(positive_scores.shape) > 1:
positive_scores = positive_scores.squeeze()
if len(negative_scores.shape) > 1:
negative_scores = negative_scores.squeeze()
thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores]))
thresholds = paddle.unique(thresholds)
# Adding intermediate thresholds
interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2
thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds]))
# Computing False Rejection Rate (miss detection)
positive_scores = paddle.concat(
len(thresholds) * [positive_scores.unsqueeze(0)])
pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds
p_miss = (pos_scores_threshold.sum(0)
).astype("float32") / positive_scores.shape[1]
del positive_scores
del pos_scores_threshold
# Computing False Acceptance Rate (false alarm)
negative_scores = paddle.concat(
len(thresholds) * [negative_scores.unsqueeze(0)])
neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds
p_fa = (neg_scores_threshold.sum(0)
).astype("float32") / negative_scores.shape[1]
del negative_scores
del neg_scores_threshold
c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
c_min = paddle.min(c_det, axis=0)
min_index = paddle.argmin(c_det, axis=0)
return float(c_min), float(thresholds[min_index])

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,27 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .download import decompress
from .download import download_and_decompress
from .download import load_state_dict_from_url
from .env import DATA_HOME
from .env import MODEL_HOME
from .env import PPAUDIO_HOME
from .env import USER_HOME
from .error import ParameterError
from .log import Logger
from .log import logger
from .time import seconds_to_hms
from .time import Timer
from .numeric import depth_convert
from .numeric import pcm16to32

@ -0,0 +1,64 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Dict
from typing import List
from paddle.framework import load as load_state_dict
from paddle.utils import download
from .log import logger
download.logger = logger
__all__ = [
'decompress',
'download_and_decompress',
'load_state_dict_from_url',
]
def decompress(file: str):
"""
Extracts all files from a compressed file.
"""
assert os.path.isfile(file), "File: {} not exists.".format(file)
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]],
path: str,
decompress: bool=True):
"""
Download archieves and decompress to specific path.
"""
if not os.path.isdir(path):
os.makedirs(path)
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
download.get_path_from_url(
archive['url'], path, archive['md5'], decompress=decompress)
def load_state_dict_from_url(url: str, path: str, md5: str=None):
"""
Download and load a state dict from url
"""
if not os.path.isdir(path):
os.makedirs(path)
download.get_path_from_url(url, path, md5)
return load_state_dict(os.path.join(path, os.path.basename(url)))

@ -0,0 +1,60 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module is used to store environmental variables in PaddleAudio.
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
default value through the PPAUDIO_HOME environment variable.
MODEL_HOME --> Store model files.
DATA_HOME --> Store automatically downloaded datasets.
'''
import os
__all__ = [
'USER_HOME',
'PPAUDIO_HOME',
'MODEL_HOME',
'DATA_HOME',
]
def _get_user_home():
return os.path.expanduser('~')
def _get_ppaudio_home():
if 'PPAUDIO_HOME' in os.environ:
home_path = os.environ['PPAUDIO_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError(
'The environment variable PPAUDIO_HOME {} is not a directory.'.
format(home_path))
else:
return home_path
return os.path.join(_get_user_home(), '.paddleaudio')
def _get_sub_home(directory):
home = os.path.join(_get_ppaudio_home(), directory)
if not os.path.exists(home):
os.makedirs(home)
return home
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')

@ -0,0 +1,20 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['ParameterError']
class ParameterError(Exception):
"""Exception class for Parameter checking"""
pass

@ -0,0 +1,139 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import functools
import logging
import threading
import time
import colorlog
__all__ = [
'Logger',
'logger',
]
log_config = {
'DEBUG': {
'level': 10,
'color': 'purple'
},
'INFO': {
'level': 20,
'color': 'green'
},
'TRAIN': {
'level': 21,
'color': 'cyan'
},
'EVAL': {
'level': 22,
'color': 'blue'
},
'WARNING': {
'level': 30,
'color': 'yellow'
},
'ERROR': {
'level': 40,
'color': 'red'
},
'CRITICAL': {
'level': 50,
'color': 'bold_red'
}
}
class Logger(object):
'''
Deafult logger in PaddleAudio
Args:
name(str) : Logger name, default is 'PaddleAudio'
'''
def __init__(self, name: str=None):
name = 'PaddleAudio' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(self.__call__,
conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float=0.1):
'''
Continuously print a progress bar with rotating special effects.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()

@ -0,0 +1,107 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
import numpy as np
__all__ = ["pcm16to32", "depth_convert"]
def pcm16to32(audio: np.ndarray) -> np.ndarray:
"""pcm int16 to float32
Args:
audio (np.ndarray): Waveform with dtype of int16.
Returns:
np.ndarray: Waveform with dtype of float32.
"""
if audio.dtype == np.int16:
audio = audio.astype("float32")
bits = np.iinfo(np.int16).bits
audio = audio / (2**(bits - 1))
return audio
def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
"""Data type casting in a safe way, i.e., prevent overflow or underflow.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
dtype (Union[type, str]): Data type of waveform.
Returns:
np.ndarray: `y` after safe casting.
"""
if 'float' in str(y.dtype):
return np.clip(y, np.finfo(dtype).min,
np.finfo(dtype).max).astype(dtype)
else:
return np.clip(y, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
"""Convert audio array to target dtype safely.
This function convert audio waveform to a target dtype, with addition steps of
preventing overflow/underflow and preserving audio range.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
dtype (Union[type, str]): Data type of waveform.
Returns:
np.ndarray: `y` after safe casting.
"""
SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
if y.dtype not in SUPPORT_DTYPE:
raise ParameterError(
'Unsupported audio dtype, '
f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
if dtype not in SUPPORT_DTYPE:
raise ParameterError(
'Unsupported audio dtype, '
f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
if dtype == y.dtype:
return y
if dtype == 'float64' and y.dtype == 'float32':
return _safe_cast(y, dtype)
if dtype == 'float32' and y.dtype == 'float64':
return _safe_cast(y, dtype)
if dtype == 'int16' or dtype == 'int8':
if y.dtype in ['float64', 'float32']:
factor = np.iinfo(dtype).max
y = np.clip(y * factor, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
y = y.astype(dtype)
else:
if dtype == 'int16' and y.dtype == 'int8':
factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
y = y.astype('float32') * factor
y = y.astype('int16')
else: # dtype == 'int8' and y.dtype=='int16':
y = y.astype('int32') * np.iinfo('int8').max / \
np.iinfo('int16').max
y = y.astype('int8')
if dtype in ['float32', 'float64']:
org_dtype = y.dtype
y = y.astype(dtype) / np.iinfo(org_dtype).max
return y

@ -0,0 +1,72 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
__all__ = [
'Timer',
'seconds_to_hms',
]
class Timer(object):
'''Calculate runing speed and estimated time of arrival(ETA)'''
def __init__(self, total_step: int):
self.total_step = total_step
self.last_start_step = 0
self.current_step = 0
self._is_running = True
def start(self):
self.last_time = time.time()
self.start_time = time.time()
def stop(self):
self._is_running = False
self.end_time = time.time()
def count(self) -> int:
if not self.current_step >= self.total_step:
self.current_step += 1
return self.current_step
@property
def timing(self) -> float:
run_steps = self.current_step - self.last_start_step
self.last_start_step = self.current_step
time_used = time.time() - self.last_time
self.last_time = time.time()
return run_steps / time_used
@property
def is_running(self) -> bool:
return self._is_running
@property
def eta(self) -> str:
if not self.is_running:
return '00:00:00'
scale = self.total_step / self.current_step
remaining_time = (time.time() - self.start_time) * scale
return seconds_to_hms(remaining_time)
def seconds_to_hms(seconds: int) -> str:
'''Convert the number of seconds to hh:mm:ss'''
h = math.floor(seconds / 3600)
m = math.floor((seconds - h * 3600) / 60)
s = int(seconds - h * 3600 - m * 60)
hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
return hms_str

@ -0,0 +1,99 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import setuptools
from setuptools.command.install import install
from setuptools.command.test import test
# set the version here
VERSION = '1.0.2'
# Inspired by the example at https://pytest.org/latest/goodpractises.html
class TestCommand(test):
def finalize_options(self):
test.finalize_options(self)
self.test_args = []
self.test_suite = True
def run(self):
self.run_benchmark()
super(TestCommand, self).run()
def run_tests(self):
# Run nose ensuring that argv simulates running nosetests directly
import nose
nose.run_exit(argv=['nosetests', '-w', 'tests'])
def run_benchmark(self):
for benchmark_item in glob.glob('tests/benchmark/*py'):
os.system(f'pytest {benchmark_item}')
class InstallCommand(install):
def run(self):
install.run(self)
def write_version_py(filename='paddleaudio/__init__.py'):
with open(filename, "a") as f:
f.write(f"__version__ = '{VERSION}'")
def remove_version_py(filename='paddleaudio/__init__.py'):
with open(filename, "r") as f:
lines = f.readlines()
with open(filename, "w") as f:
for line in lines:
if "__version__" not in line:
f.write(line)
remove_version_py()
write_version_py()
setuptools.setup(
name="paddleaudio",
version=VERSION,
author="",
author_email="",
description="PaddleAudio, in development",
long_description="",
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(include=['paddleaudio*']),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
install_requires=[
'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
],
extras_require={
'test': [
'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1',
'torchaudio==0.10.2', 'pytest-benchmark'
],
},
cmdclass={
'install': InstallCommand,
'test': TestCommand,
}, )
remove_version_py()

@ -32,9 +32,9 @@ __all__ = [
'to_mono',
'normalize',
'save',
'soudfile_save',
'soundfile_save',
'load',
'load_old',
'soundfile_load',
'info',
'to_mono'
]
@ -659,4 +659,4 @@ def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
sinfo.channels,
bits_per_sample=_get_bit_depth(sinfo.subtype),
encoding=_get_encoding(sinfo.format, sinfo.subtype),
)
)

Loading…
Cancel
Save