From 4df081b9542ac1545bbf07f78c15b0778b5694ca Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 15 Sep 2022 15:17:26 +0800 Subject: [PATCH 1/2] split paddleaudio --- audio/paddleaudio/__init__.py | 21 + audio/paddleaudio/_internal/__init__.py | 0 audio/paddleaudio/_internal/module_utils.py | 148 ++++ audio/paddleaudio/backends/__init__.py | 26 + audio/paddleaudio/backends/common.py | 55 ++ audio/paddleaudio/backends/no_backend.py | 32 + .../paddleaudio/backends/soundfile_backend.py | 661 +++++++++++++++ audio/paddleaudio/backends/sox_io_backend.py | 101 +++ audio/paddleaudio/backends/utils.py | 81 ++ audio/paddleaudio/compliance/__init__.py | 15 + audio/paddleaudio/compliance/kaldi.py | 638 ++++++++++++++ audio/paddleaudio/compliance/librosa.py | 788 ++++++++++++++++++ audio/paddleaudio/datasets/__init__.py | 20 + audio/paddleaudio/datasets/dataset.py | 100 +++ audio/paddleaudio/datasets/esc50.py | 152 ++++ audio/paddleaudio/datasets/gtzan.py | 115 +++ audio/paddleaudio/datasets/hey_snips.py | 74 ++ audio/paddleaudio/datasets/rirs_noises.py | 201 +++++ audio/paddleaudio/datasets/tess.py | 126 +++ audio/paddleaudio/datasets/urban_sound.py | 104 +++ audio/paddleaudio/datasets/voxceleb.py | 356 ++++++++ audio/paddleaudio/features/__init__.py | 17 + audio/paddleaudio/features/layers.py | 328 ++++++++ audio/paddleaudio/functional/__init__.py | 20 + audio/paddleaudio/functional/functional.py | 266 ++++++ audio/paddleaudio/functional/window.py | 337 ++++++++ audio/paddleaudio/io/__init__.py | 13 + audio/paddleaudio/metric/__init__.py | 15 + audio/paddleaudio/metric/eer.py | 100 +++ audio/paddleaudio/sox_effects/__init__.py | 13 + audio/paddleaudio/utils/__init__.py | 27 + audio/paddleaudio/utils/download.py | 64 ++ audio/paddleaudio/utils/env.py | 60 ++ audio/paddleaudio/utils/error.py | 20 + audio/paddleaudio/utils/log.py | 139 +++ audio/paddleaudio/utils/numeric.py | 107 +++ audio/paddleaudio/utils/time.py | 72 ++ audio/setup.py | 99 +++ .../audio/backends/soundfile_backend.py | 6 +- 39 files changed, 5514 insertions(+), 3 deletions(-) create mode 100644 audio/paddleaudio/__init__.py create mode 100644 audio/paddleaudio/_internal/__init__.py create mode 100644 audio/paddleaudio/_internal/module_utils.py create mode 100644 audio/paddleaudio/backends/__init__.py create mode 100644 audio/paddleaudio/backends/common.py create mode 100644 audio/paddleaudio/backends/no_backend.py create mode 100644 audio/paddleaudio/backends/soundfile_backend.py create mode 100644 audio/paddleaudio/backends/sox_io_backend.py create mode 100644 audio/paddleaudio/backends/utils.py create mode 100644 audio/paddleaudio/compliance/__init__.py create mode 100644 audio/paddleaudio/compliance/kaldi.py create mode 100644 audio/paddleaudio/compliance/librosa.py create mode 100644 audio/paddleaudio/datasets/__init__.py create mode 100644 audio/paddleaudio/datasets/dataset.py create mode 100644 audio/paddleaudio/datasets/esc50.py create mode 100644 audio/paddleaudio/datasets/gtzan.py create mode 100644 audio/paddleaudio/datasets/hey_snips.py create mode 100644 audio/paddleaudio/datasets/rirs_noises.py create mode 100644 audio/paddleaudio/datasets/tess.py create mode 100644 audio/paddleaudio/datasets/urban_sound.py create mode 100644 audio/paddleaudio/datasets/voxceleb.py create mode 100644 audio/paddleaudio/features/__init__.py create mode 100644 audio/paddleaudio/features/layers.py create mode 100644 audio/paddleaudio/functional/__init__.py create mode 100644 audio/paddleaudio/functional/functional.py create mode 100644 
audio/paddleaudio/functional/window.py
 create mode 100644 audio/paddleaudio/io/__init__.py
 create mode 100644 audio/paddleaudio/metric/__init__.py
 create mode 100644 audio/paddleaudio/metric/eer.py
 create mode 100644 audio/paddleaudio/sox_effects/__init__.py
 create mode 100644 audio/paddleaudio/utils/__init__.py
 create mode 100644 audio/paddleaudio/utils/download.py
 create mode 100644 audio/paddleaudio/utils/env.py
 create mode 100644 audio/paddleaudio/utils/error.py
 create mode 100644 audio/paddleaudio/utils/log.py
 create mode 100644 audio/paddleaudio/utils/numeric.py
 create mode 100644 audio/paddleaudio/utils/time.py
 create mode 100644 audio/setup.py

diff --git a/audio/paddleaudio/__init__.py b/audio/paddleaudio/__init__.py
new file mode 100644
index 000000000..381f7e681
--- /dev/null
+++ b/audio/paddleaudio/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import compliance
+from . import datasets
+from . import features
+from . import functional
+from . import io
+from . import metric
+from . import sox_effects
+from . import backends

diff --git a/audio/paddleaudio/_internal/__init__.py b/audio/paddleaudio/_internal/__init__.py
new file mode 100644
index 000000000..e69de29bb

diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py
new file mode 100644
index 000000000..76e6701d6
--- /dev/null
+++ b/audio/paddleaudio/_internal/module_utils.py
@@ -0,0 +1,148 @@
+import importlib.util
+import warnings
+from functools import wraps
+from typing import Optional
+
+# Code from https://github.com/pytorch/audio/blob/main/torchaudio/_internal/module_utils.py
+
+
+def is_module_available(*modules: str) -> bool:
+    r"""Returns whether a top-level module with :attr:`name` exists *without*
+    importing it. This is generally safer than a try-except block around an
+    ``import X``. It avoids third party libraries breaking assumptions of some of
+    our tests, e.g., setting multiprocessing start method when imported
+    (see librosa/#747, torchvision/#544).
+    """
+    return all(importlib.util.find_spec(m) is not None for m in modules)
+
+
+def requires_module(*modules: str):
+    """Decorate a function to give an error message if it is invoked without the
+    required optional modules. This decorator gives a better error message to
+    users rather than raising ``NameError: name 'module' is not defined`` at
+    random places.
+    """
+    missing = [m for m in modules if not is_module_available(m)]
+
+    if not missing:
+        # fall through. If all the modules are available, no need to decorate
+        def decorator(func):
+            return func
+
+    else:
+        req = f"module: {missing[0]}" if len(
+            missing) == 1 else f"modules: {missing}"
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                raise RuntimeError(
+                    f"{func.__module__}.{func.__name__} requires {req}")
+
+            return wrapped
+
+    return decorator
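+# Hypothetical usage sketch (illustrative only, not part of this patch):
+# guarding an optional dependency with the decorator above. `some_feature`
+# is an invented name.
+#
+#   @requires_module("soundfile")
+#   def some_feature():
+#       import soundfile
+#       return soundfile.__version__
+#
+# If soundfile is missing, calling some_feature() raises a RuntimeError that
+# names the missing module, instead of failing with an unrelated error later.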
+def deprecated(direction: str, version: Optional[str]=None):
+    """Decorator to add a deprecation message.
+    Args:
+        direction (str): Migration steps to be given to users.
+        version (str or int): The version when the object will be removed.
+    """
+
+    def decorator(func):
+        @wraps(func)
+        def wrapped(*args, **kwargs):
+            message = (
+                f"{func.__module__}.{func.__name__} has been deprecated "
+                f'and will be removed in {"a future" if version is None else version} release. '
+                f"{direction}")
+            warnings.warn(message, stacklevel=2)
+            return func(*args, **kwargs)
+
+        return wrapped
+
+    return decorator
+
+
+def is_kaldi_available():
+    return is_module_available("paddleaudio._paddleaudio")
+
+
+def requires_kaldi():
+    if is_kaldi_available():
+
+        def decorator(func):
+            return func
+
+    else:
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                raise RuntimeError(
+                    f"{func.__module__}.{func.__name__} requires kaldi")
+
+            return wrapped
+
+    return decorator
+
+
+def _check_soundfile_importable():
+    if not is_module_available("soundfile"):
+        return False
+    try:
+        import soundfile  # noqa: F401
+
+        return True
+    except Exception:
+        warnings.warn(
+            "Failed to import soundfile. 'soundfile' backend is not available.")
+        return False
+
+
+_is_soundfile_importable = _check_soundfile_importable()
+
+
+def is_soundfile_available():
+    return _is_soundfile_importable
+
+
+def requires_soundfile():
+    if is_soundfile_available():
+
+        def decorator(func):
+            return func
+    else:
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                raise RuntimeError(
+                    f"{func.__module__}.{func.__name__} requires soundfile")
+
+            return wrapped
+
+    return decorator
+
+
+def is_sox_available():
+    return is_module_available("paddleaudio._paddleaudio")
+
+
+def requires_sox():
+    if is_sox_available():
+
+        def decorator(func):
+            return func
+    else:
+
+        def decorator(func):
+            @wraps(func)
+            def wrapped(*args, **kwargs):
+                raise RuntimeError(
+                    f"{func.__module__}.{func.__name__} requires sox")
+
+            return wrapped
+
+    return decorator

diff --git a/audio/paddleaudio/backends/__init__.py b/audio/paddleaudio/backends/__init__.py
new file mode 100644
index 000000000..8fcd0765f
--- /dev/null
+++ b/audio/paddleaudio/backends/__init__.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .soundfile_backend import depth_convert
+from .soundfile_backend import soundfile_load
+from .soundfile_backend import normalize
+from .soundfile_backend import resample
+from .soundfile_backend import soundfile_save
+from .soundfile_backend import to_mono
+
+from . import utils
+from .utils import get_audio_backend
+from .utils import list_audio_backends
+from .utils import set_audio_backend
+
+utils._init_audio_backend()
\ No newline at end of file

diff --git a/audio/paddleaudio/backends/common.py b/audio/paddleaudio/backends/common.py
new file mode 100644
index 000000000..9d3edf812
--- /dev/null
+++ b/audio/paddleaudio/backends/common.py
@@ -0,0 +1,55 @@
+# Taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/common.py with modification.
+
+class AudioInfo:
+    """Return type of the ``info`` functions.
+
+    This class is used by :ref:`"sox_io" backend` and
+    :ref:`"soundfile" backend with the new interface`.
+
+    :ivar int sample_rate: Sample rate
+    :ivar int num_frames: The number of frames
+    :ivar int num_channels: The number of channels
+    :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats,
+        or when it cannot be accurately inferred.
+    :ivar str encoding: Audio encoding
+        The values encoding can take are one of the following:
+
+        * ``PCM_S``: Signed integer linear PCM
+        * ``PCM_U``: Unsigned integer linear PCM
+        * ``PCM_F``: Floating point linear PCM
+        * ``FLAC``: Flac, Free Lossless Audio Codec
+        * ``ULAW``: Mu-law
+        * ``ALAW``: A-law
+        * ``MP3`` : MP3, MPEG-1 Audio Layer III
+        * ``VORBIS``: OGG Vorbis
+        * ``AMR_WB``: Adaptive Multi-Rate Wideband
+        * ``AMR_NB``: Adaptive Multi-Rate Narrowband
+        * ``OPUS``: Opus
+        * ``HTK``: Single channel 16-bit PCM
+        * ``UNKNOWN`` : None of above
+    """
+
+    def __init__(
+            self,
+            sample_rate: int,
+            num_frames: int,
+            num_channels: int,
+            bits_per_sample: int,
+            encoding: str, ):
+        self.sample_rate = sample_rate
+        self.num_frames = num_frames
+        self.num_channels = num_channels
+        self.bits_per_sample = bits_per_sample
+        self.encoding = encoding
+
+    def __str__(self):
+        return (f"AudioMetaData("
+                f"sample_rate={self.sample_rate}, "
+                f"num_frames={self.num_frames}, "
+                f"num_channels={self.num_channels}, "
+                f"bits_per_sample={self.bits_per_sample}, "
+                f"encoding={self.encoding}"
+                f")")
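+# Illustrative note (not part of this patch): the backends below return this
+# object from their `info()` implementations, e.g.
+#
+#   meta = AudioInfo(16000, 32000, 1, 16, "PCM_S")
+#   print(meta)  # AudioMetaData(sample_rate=16000, num_frames=32000, ...)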
diff --git a/audio/paddleaudio/backends/no_backend.py b/audio/paddleaudio/backends/no_backend.py
new file mode 100644
index 000000000..157536f46
--- /dev/null
+++ b/audio/paddleaudio/backends/no_backend.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+from typing import Callable
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+from paddle import Tensor
+
+# Code from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py
+
+
+def load(
+        filepath: Union[str, Path],
+        out: Optional[Tensor]=None,
+        normalization: Union[bool, float, Callable]=True,
+        channels_first: bool=True,
+        num_frames: int=0,
+        offset: int=0,
+        filetype: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    raise RuntimeError("No audio I/O backend is available.")
+
+
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         precision: int=16,
+         channels_first: bool=True) -> None:
+    raise RuntimeError("No audio I/O backend is available.")
+
+
+def info(filepath: str) -> None:
+    raise RuntimeError("No audio I/O backend is available.")

diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
new file mode 100644
index 000000000..e1546fedd
--- /dev/null
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@@ -0,0 +1,661 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import warnings
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import paddle
+import resampy
+import soundfile
+from scipy.io import wavfile
+
+from ..utils import depth_convert
+from ..utils import ParameterError
+from .common import AudioInfo
+
+__all__ = [
+    'resample',
+    'to_mono',
+    'normalize',
+    'save',
+    'soundfile_save',
+    'load',
+    'soundfile_load',
+    'info',
+]
+NORMALIZE_TYPES = ['linear', 'gaussian']
+MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
+RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
+EPS = 1e-8
+
+
+def resample(y: np.ndarray,
+             src_sr: int,
+             target_sr: int,
+             mode: str='kaiser_fast') -> np.ndarray:
+    """Audio resampling.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        src_sr (int): Source sample rate.
+        target_sr (int): Target sample rate.
+        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        np.ndarray: `y` resampled to `target_sr`
+    """
+
+    if mode == 'kaiser_best':
+        warnings.warn(
+            f'Using resampy kaiser_best for {src_sr}=>{target_sr}. This mode is pretty slow; '
+            'we recommend kaiser_fast for large-scale audio training.')
+
+    if not isinstance(y, np.ndarray):
+        raise ParameterError(
+            f'Only numpy np.ndarray is supported, but received y of type {type(y)}')
+
+    if mode not in RESAMPLE_MODES:
+        raise ParameterError(f'resample mode must be in {RESAMPLE_MODES}')
+
+    return resampy.resample(y, src_sr, target_sr, filter=mode)
+
+
+def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
+    """Convert stereo audio to mono.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
+
+    Returns:
+        np.ndarray: `y` with mono channel.
+    """
+
+    if merge_type not in MERGE_TYPES:
+        raise ParameterError(
+            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
+        )
+    if y.ndim > 2:
+        raise ParameterError(
+            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
+    if y.ndim == 1:  # nothing to merge
+        return y
+
+    if merge_type == 'ch0':
+        return y[0]
+    if merge_type == 'ch1':
+        return y[1]
+    if merge_type == 'random':
+        return y[np.random.randint(0, 2)]
+
+    # need to do averaging according to dtype
+
+    if y.dtype == 'float32':
+        y_out = (y[0] + y[1]) * 0.5
+    elif y.dtype == 'int16':
+        y_out = y.astype('int32')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+
+    elif y.dtype == 'int8':
+        y_out = y.astype('int16')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+    else:
+        raise ParameterError(f'Unsupported dtype: {y.dtype}')
+    return y_out
+
+
+def soundfile_load_(file: os.PathLike,
+                    offset: Optional[float]=None,
+                    dtype: str='int16',
+                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
+    """Load audio using the soundfile library. This function loads an audio file using libsndfile.
+
+    Args:
+        file (os.PathLike): File of waveform.
+        offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
+    """
+    with soundfile.SoundFile(file) as sf_desc:
+        sr_native = sf_desc.samplerate
+        if offset:
+            sf_desc.seek(int(offset * sr_native))
+        if duration is not None:
+            frame_duration = int(duration * sr_native)
+        else:
+            frame_duration = -1
+        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
+
+    return y, sf_desc.samplerate
+
+
+def normalize(y: np.ndarray, norm_type: str='linear',
+              mul_factor: float=1.0) -> np.ndarray:
+    """Normalize an input audio with an additional multiplier.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+
+    Returns:
+        np.ndarray: `y` after normalization.
+    """
+
+    if norm_type == 'linear':
+        amax = np.max(np.abs(y))
+        factor = 1.0 / (amax + EPS)
+        y = y * factor * mul_factor
+    elif norm_type == 'gaussian':
+        amean = np.mean(y)
+        astd = np.std(y)
+        astd = max(astd, EPS)
+        y = mul_factor * (y - amean) / astd
+    else:
+        raise NotImplementedError(f'norm_type should be in {NORMALIZE_TYPES}')
+
+    return y
+
+
+def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
+    """Save an audio file to disk. This function saves audio to disk using scipy.io.wavfile, with an additional step to convert the input waveform to int16.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        sr (int): Sample rate.
+        file (os.PathLike): Path of audio file to save.
+    """
+    if not file.endswith('.wav'):
+        raise ParameterError(
+            f'only .wav file supported, but dst file name is: {file}')
+
+    if sr <= 0:
+        raise ParameterError(
+            f'Sample rate should be larger than 0, received sr = {sr}')
+
+    if y.dtype not in ['int16', 'int8']:
+        warnings.warn(
+            f'input data type is {y.dtype}, will convert data to int16 format before saving'
+        )
+        y_out = depth_convert(y, 'int16')
+    else:
+        y_out = y
+
+    wavfile.write(file, sr, y_out)
+
+
+def soundfile_load(
+        file: os.PathLike,
+        sr: Optional[int]=None,
+        mono: bool=True,
+        merge_type: str='average',  # ch0,ch1,random,average
+        normal: bool=True,
+        norm_type: str='linear',
+        norm_mul_factor: float=1.0,
+        offset: float=0.0,
+        duration: Optional[int]=None,
+        dtype: str='float32',
+        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
+    """Load an audio file from disk. This function loads audio from disk using an audio backend.
+
+    Args:
+        file (os.PathLike): Path of audio file to load.
+        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
+        mono (bool, optional): Return waveform with mono channel. Defaults to True.
+        merge_type (str, optional): Merge type of multi-channel waveform. Defaults to 'average'.
+        normal (bool, optional): Waveform normalization. Defaults to True.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
+        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
+    """
+
+    y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration)
+
+    if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
+        raise ParameterError(f'audio file {file} looks empty')
+
+    if mono:
+        y = to_mono(y, merge_type)
+
+    if sr is not None and sr != r:
+        y = resample(y, r, sr, mode=resample_mode)
+        r = sr
+
+    if normal:
+        y = normalize(y, norm_type, norm_mul_factor)
+    elif dtype in ['int8', 'int16']:
+        # still need to do normalization, before depth conversion
+        y = normalize(y, 'linear', 1.0)
+
+    y = depth_convert(y, dtype)
+    return y, r
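+# Usage sketch (illustrative only, not part of this patch; "test.wav" and
+# "out.wav" are placeholder paths):
+#
+#   wav, sr = soundfile_load("test.wav", sr=16000, mono=True, dtype="float32")
+#   soundfile_save(wav, sr, "out.wav")  # non-int input is depth-converted to int16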
+# The code below is taken from https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py with modification.
+
+
+def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str,
+                         bits_per_sample: int):
+    if not encoding:
+        if not bits_per_sample:
+            subtype = {
+                paddle.uint8: "PCM_U8",
+                paddle.int16: "PCM_16",
+                paddle.int32: "PCM_32",
+                paddle.float32: "FLOAT",
+                paddle.float64: "DOUBLE",
+            }.get(dtype)
+            if not subtype:
+                raise ValueError(f"Unsupported dtype for wav: {dtype}")
+            return subtype
+        if bits_per_sample == 8:
+            return "PCM_U8"
+        return f"PCM_{bits_per_sample}"
+    if encoding == "PCM_S":
+        if not bits_per_sample:
+            return "PCM_32"
+        if bits_per_sample == 8:
+            raise ValueError("wav does not support 8-bit signed PCM encoding.")
+        return f"PCM_{bits_per_sample}"
+    if encoding == "PCM_U":
+        if bits_per_sample in (None, 8):
+            return "PCM_U8"
+        raise ValueError("wav only supports 8-bit unsigned PCM encoding.")
+    if encoding == "PCM_F":
+        if bits_per_sample in (None, 32):
+            return "FLOAT"
+        if bits_per_sample == 64:
+            return "DOUBLE"
+        raise ValueError("wav only supports 32/64-bit float PCM encoding.")
+    if encoding == "ULAW":
+        if bits_per_sample in (None, 8):
+            return "ULAW"
+        raise ValueError("wav only supports 8-bit mu-law encoding.")
+    if encoding == "ALAW":
+        if bits_per_sample in (None, 8):
+            return "ALAW"
+        raise ValueError("wav only supports 8-bit a-law encoding.")
+    raise ValueError(f"wav does not support {encoding}.")
+
+
+def _get_subtype_for_sphere(encoding: str, bits_per_sample: int):
+    if encoding in (None, "PCM_S"):
+        return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32"
+    if encoding in ("PCM_U", "PCM_F"):
+        raise ValueError(f"sph does not support {encoding} encoding.")
+    if encoding == "ULAW":
+        if bits_per_sample in (None, 8):
+            return "ULAW"
+        raise ValueError("sph only supports 8-bit for mu-law encoding.")
+    if encoding == "ALAW":
+        return "ALAW"
+    raise ValueError(f"sph does not support {encoding}.")
+
+
+def _get_subtype(dtype: paddle.dtype, format: str, encoding: str,
+                 bits_per_sample: int):
+    if format == "wav":
+        return _get_subtype_for_wav(dtype, encoding, bits_per_sample)
+    if format == "flac":
+        if encoding:
+            raise ValueError("flac does not support encoding.")
+        if not bits_per_sample:
+            return "PCM_16"
+        if bits_per_sample > 24:
+            raise ValueError("flac does not support bits_per_sample > 24.")
+        return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}"
+    if format in ("ogg", "vorbis"):
+        if encoding or bits_per_sample:
+            raise ValueError(
+                "ogg/vorbis does not support encoding/bits_per_sample.")
+        return "VORBIS"
+    if format == "sph":
+        return _get_subtype_for_sphere(encoding, bits_per_sample)
+    if format in ("nis", "nist"):
+        return "PCM_16"
+    raise ValueError(f"Unsupported format: {format}")
+
+
+def save(
+        filepath: str,
+        src: paddle.Tensor,
+        sample_rate: int,
+        channels_first: bool=True,
+        compression: Optional[float]=None,
+        format: Optional[str]=None,
+        encoding: Optional[str]=None,
+        bits_per_sample: Optional[int]=None, ):
+    """Save audio data to file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats;
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend.
+
+    Args:
+        filepath (str or pathlib.Path): Path to audio file.
+        src (paddle.Tensor): Audio data to save. Must be a 2D tensor.
+        sample_rate (int): sampling rate
+        channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`,
+            otherwise `[time, channel]`.
+        compression (float or None, optional): Not used.
+            It is here only for interface compatibility reasons with the "sox_io" backend.
+        format (str or None, optional): Override the audio format.
+            When ``filepath`` argument is path-like object, audio format is
+            inferred from file extension. If the file extension is missing or
+            different, you can specify the correct format with this argument.
+
+            When ``filepath`` argument is file-like object,
+            this argument is required.
+
+            Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``,
+            ``"flac"`` and ``"sph"``.
+        encoding (str or None, optional): Changes the encoding for supported formats.
+            This argument is effective only for supported formats, such as
+            ``"wav"``, ``"flac"`` and ``"sph"``. Valid values are;
+
+            - ``"PCM_S"`` (signed integer Linear PCM)
+            - ``"PCM_U"`` (unsigned integer Linear PCM)
+            - ``"PCM_F"`` (floating point PCM)
+            - ``"ULAW"`` (mu-law)
+            - ``"ALAW"`` (a-law)
+
+        bits_per_sample (int or None, optional): Changes the bit depth for the
+            supported formats.
+            When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``,
+            you can change the bit depth.
+            Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``.
+
+    Supported formats/encodings/bit depth/compression are:
+
+    ``"wav"``
+        - 32-bit floating-point PCM
+        - 32-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 8-bit unsigned integer PCM
+        - 8-bit mu-law
+        - 8-bit a-law
+
+        Note:
+            Default encoding/bit depth is determined by the dtype of
+            the input Tensor.
+
+    ``"flac"``
+        - 8-bit
+        - 16-bit (default)
+        - 24-bit
+
+    ``"ogg"``, ``"vorbis"``
+        - Doesn't accept changing configuration.
+
+    ``"sph"``
+        - 8-bit signed integer PCM
+        - 16-bit signed integer PCM
+        - 24-bit signed integer PCM
+        - 32-bit signed integer PCM (default)
+        - 8-bit mu-law
+        - 8-bit a-law
+        - 16-bit a-law
+        - 24-bit a-law
+        - 32-bit a-law
+
+    """
+    if src.ndim != 2:
+        raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.")
+    if compression is not None:
+        warnings.warn(
+            '`save` function of "soundfile" backend does not support "compression" parameter. '
+            "The argument is silently ignored.")
+    if hasattr(filepath, "write"):
+        if format is None:
+            raise RuntimeError(
+                "`format` is required when saving to file object.")
+        ext = format.lower()
+    else:
+        ext = str(filepath).split(".")[-1].lower()
+
+    if bits_per_sample not in (None, 8, 16, 24, 32, 64):
+        raise ValueError("Invalid bits_per_sample.")
+    if bits_per_sample == 24:
+        warnings.warn(
+            "Saving audio with 24 bits per sample might warp samples near -1. "
+            "Using 16 bits per sample might be able to avoid this.")
+    subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample)
+
+    # sph is an extension used in TED-LIUM but soundfile does not recognize it as NIST format,
+    # so we extend the extensions manually here
+    if ext in ["nis", "nist", "sph"] and format is None:
+        format = "NIST"
+
+    if channels_first:
+        src = src.t()
+
+    soundfile.write(
+        file=filepath,
+        data=src,
+        samplerate=sample_rate,
+        subtype=subtype,
+        format=format)
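+# Usage sketch (illustrative only, not part of this patch; paths and the
+# zero tensor are placeholders):
+#
+#   waveform = paddle.zeros([1, 16000], dtype='float32')  # (channel, time)
+#   save("out.wav", waveform, 16000)                       # stored as 32-bit float PCM
+#   save("out16.wav", waveform, 16000, encoding="PCM_S", bits_per_sample=16)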
+_SUBTYPE2DTYPE = {
+    "PCM_S8": "int8",
+    "PCM_U8": "uint8",
+    "PCM_16": "int16",
+    "PCM_32": "int32",
+    "FLOAT": "float32",
+    "DOUBLE": "float64",
+}
+
+
+def load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[paddle.Tensor, int]:
+    """Load audio data from file.
+
+    Note:
+        The formats this function can handle depend on the soundfile installation.
+        This function is tested on the following formats;
+
+        * WAV
+
+            * 32-bit floating-point
+            * 32-bit signed integer
+            * 16-bit signed integer
+            * 8-bit unsigned integer
+
+        * FLAC
+        * OGG/VORBIS
+        * SPHERE
+
+    By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with
+    ``float32`` dtype and the shape of `[channel, time]`.
+    The samples are normalized to fit in the range of ``[-1.0, 1.0]``.
+
+    When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit
+    signed integer and 8-bit unsigned integer (24-bit signed integer is not supported),
+    by providing ``normalize=False``, this function can return integer Tensor, where the samples
+    are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor
+    for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM.
+
+    ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as
+    ``flac`` and ``mp3``.
+    For these formats, this function always returns ``float32`` Tensor with values normalized to
+    ``[-1.0, 1.0]``.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        frame_offset (int, optional):
+            Number of frames to skip before start reading data.
+        num_frames (int, optional):
+            Maximum number of frames to read. ``-1`` reads all the remaining samples,
+            starting from ``frame_offset``.
+            This function may return a smaller number of frames if there are not enough
+            frames in the given file.
+        normalize (bool, optional):
+            When ``True``, this function always returns ``float32``, and sample values are
+            normalized to ``[-1.0, 1.0]``.
+            If the input file is integer WAV, giving ``False`` will change the resulting Tensor type to
+            integer type.
+            This argument has no effect for formats other than integer WAV type.
+        channels_first (bool, optional):
+            When True, the returned Tensor has dimension `[channel, time]`.
+            Otherwise, the returned Tensor's dimension is `[time, channel]`.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        (paddle.Tensor, int): Resulting Tensor and sample rate.
+            If the input file has integer wav format and normalization is off, then it has
+            integer type, else ``float32`` type. If ``channels_first=True``, it has
+            `[channel, time]` else `[time, channel]`.
+    """
+    with soundfile.SoundFile(filepath, "r") as file_:
+        if file_.format != "WAV" or normalize:
+            dtype = "float32"
+        elif file_.subtype not in _SUBTYPE2DTYPE:
+            raise ValueError(f"Unsupported subtype: {file_.subtype}")
+        else:
+            dtype = _SUBTYPE2DTYPE[file_.subtype]
+
+        frames = file_._prepare_read(frame_offset, None, num_frames)
+        waveform = file_.read(frames, dtype, always_2d=True)
+        sample_rate = file_.samplerate
+
+    waveform = paddle.to_tensor(waveform)
+    if channels_first:
+        waveform = paddle.transpose(waveform, perm=[1, 0])
+    return waveform, sample_rate
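+# Usage sketch (illustrative only; "test.wav" is a placeholder path):
+#
+#   waveform, sr = load("test.wav")              # float32, shape [channel, time]
+#   raw, sr = load("test.wav", normalize=False)  # e.g. int16 for a 16-bit PCM wav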
+# Mapping from soundfile subtype to number of bits per sample.
+# This is mostly heuristic and the value is set to 0 when it is irrelevant
+# (lossy formats) or when it can't be inferred.
+# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard:
+# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony,
+# the default seems to be 8 bits but it can be compressed further to 4 bits.
+# The dict is inspired from
+# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94
+_SUBTYPE_TO_BITS_PER_SAMPLE = {
+    "PCM_S8": 8,  # Signed 8 bit data
+    "PCM_16": 16,  # Signed 16 bit data
+    "PCM_24": 24,  # Signed 24 bit data
+    "PCM_32": 32,  # Signed 32 bit data
+    "PCM_U8": 8,  # Unsigned 8 bit data (WAV and RAW only)
+    "FLOAT": 32,  # 32 bit float data
+    "DOUBLE": 64,  # 64 bit float data
+    "ULAW": 8,  # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "ALAW": 8,  # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types
+    "IMA_ADPCM": 0,  # IMA ADPCM.
+    "MS_ADPCM": 0,  # Microsoft ADPCM.
+    "GSM610": 0,  # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate)
+    "VOX_ADPCM": 0,  # OKI / Dialogix ADPCM
+    "G721_32": 0,  # 32kbs G721 ADPCM encoding.
+    "G723_24": 0,  # 24kbs G723 ADPCM encoding.
+    "G723_40": 0,  # 40kbs G723 ADPCM encoding.
+    "DWVW_12": 12,  # 12 bit Delta Width Variable Word encoding.
+    "DWVW_16": 16,  # 16 bit Delta Width Variable Word encoding.
+    "DWVW_24": 24,  # 24 bit Delta Width Variable Word encoding.
+    "DWVW_N": 0,  # N bit Delta Width Variable Word encoding.
+    "DPCM_8": 8,  # 8 bit differential PCM (XI only)
+    "DPCM_16": 16,  # 16 bit differential PCM (XI only)
+    "VORBIS": 0,  # Xiph Vorbis encoding. (lossy)
+    "ALAC_16": 16,  # Apple Lossless Audio Codec (16 bit).
+    "ALAC_20": 20,  # Apple Lossless Audio Codec (20 bit).
+    "ALAC_24": 24,  # Apple Lossless Audio Codec (24 bit).
+    "ALAC_32": 32,  # Apple Lossless Audio Codec (32 bit).
+}
+
+
+def _get_bit_depth(subtype):
+    if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE:
+        warnings.warn(
+            f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample "
+            "attribute will be set to 0. If you are seeing this warning, please "
+            "report by opening an issue on github (after checking for existing/closed ones). "
+            "You may otherwise ignore this warning.")
+    return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0)
+_SUBTYPE_TO_ENCODING = {
+    "PCM_S8": "PCM_S",
+    "PCM_16": "PCM_S",
+    "PCM_24": "PCM_S",
+    "PCM_32": "PCM_S",
+    "PCM_U8": "PCM_U",
+    "FLOAT": "PCM_F",
+    "DOUBLE": "PCM_F",
+    "ULAW": "ULAW",
+    "ALAW": "ALAW",
+    "VORBIS": "VORBIS",
+}
+
+
+def _get_encoding(format: str, subtype: str):
+    if format == "FLAC":
+        return "FLAC"
+    return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN")
+
+
+def info(filepath: str, format: Optional[str]=None) -> AudioInfo:
+    """Get signal information of an audio file.
+
+    Note:
+        ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts
+        ``pathlib.Path`` object as well. This is for consistency with the ``"sox_io"`` backend.
+
+    Args:
+        filepath (path-like object or file-like object):
+            Source of audio data.
+        format (str or None, optional):
+            Not used. PySoundFile does not accept format hint.
+
+    Returns:
+        AudioInfo: meta data of the given audio.
+
+    """
+    sinfo = soundfile.info(filepath)
+    return AudioInfo(
+        sinfo.samplerate,
+        sinfo.frames,
+        sinfo.channels,
+        bits_per_sample=_get_bit_depth(sinfo.subtype),
+        encoding=_get_encoding(sinfo.format, sinfo.subtype), )
\ No newline at end of file

diff --git a/audio/paddleaudio/backends/sox_io_backend.py b/audio/paddleaudio/backends/sox_io_backend.py
new file mode 100644
index 000000000..8dabe75f5
--- /dev/null
+++ b/audio/paddleaudio/backends/sox_io_backend.py
@@ -0,0 +1,101 @@
+from pathlib import Path
+from typing import Callable
+from typing import Optional, Tuple, Union
+
+import paddle
+import paddleaudio
+from paddle import Tensor
+from .common import AudioInfo
+import os
+
+from paddleaudio._internal import module_utils as _mod_utils
+
+# https://github.com/pytorch/audio/blob/main/torchaudio/backend/sox_io_backend.py
+
+
+def _fail_info(filepath: str, format: Optional[str]) -> AudioInfo:
+    raise RuntimeError("Failed to fetch metadata from {}".format(filepath))
+
+
+def _fail_info_fileobj(fileobj, format: Optional[str]) -> AudioInfo:
+    raise RuntimeError("Failed to fetch metadata from {}".format(fileobj))
+
+
+# Note: need to comply TorchScript syntax -- need annotation and no f-string
+def _fail_load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    raise RuntimeError("Failed to load audio from {}".format(filepath))
+
+
+def _fail_load_fileobj(fileobj, *args, **kwargs):
+    raise RuntimeError(f"Failed to load audio from {fileobj}")
+
+
+_fallback_info = _fail_info
+_fallback_info_fileobj = _fail_info_fileobj
+_fallback_load = _fail_load
+_fallback_load_fileobj = _fail_load_fileobj
+
+
+@_mod_utils.requires_sox()
+def load(
+        filepath: str,
+        frame_offset: int=0,
+        num_frames: int=-1,
+        normalize: bool=True,
+        channels_first: bool=True,
+        format: Optional[str]=None, ) -> Tuple[Tensor, int]:
+    if hasattr(filepath, "read"):
+        ret = paddleaudio._paddleaudio.load_audio_fileobj(
+            filepath, frame_offset, num_frames, normalize, channels_first,
+            format)
+        if ret is not None:
+            audio_tensor = paddle.to_tensor(ret[0])
+            return (audio_tensor, ret[1])
+        return _fallback_load_fileobj(filepath, frame_offset, num_frames,
+                                      normalize, channels_first, format)
+    filepath = os.fspath(filepath)
+    ret = paddleaudio._paddleaudio.sox_io_load_audio_file(
+        filepath, frame_offset, num_frames, normalize, channels_first, format)
+    if ret is not None:
+        audio_tensor = paddle.to_tensor(ret[0])
+        return (audio_tensor, ret[1])
+    return _fallback_load(filepath, frame_offset, num_frames, normalize,
+                          channels_first, format)
+
+
+@_mod_utils.requires_sox()
+def save(filepath: str,
+         src: Tensor,
+         sample_rate: int,
+         channels_first: bool=True,
+         compression: Optional[float]=None,
+         format: Optional[str]=None,
+         encoding: Optional[str]=None,
+         bits_per_sample: Optional[int]=None, ):
+    src_arr = src.numpy()
+    if hasattr(filepath, "write"):
+        paddleaudio._paddleaudio.save_audio_fileobj(
+            filepath, src_arr, sample_rate, channels_first, compression,
+            format, encoding, bits_per_sample)
+        return
+    filepath = os.fspath(filepath)
+    paddleaudio._paddleaudio.sox_io_save_audio_file(
+        filepath, src_arr, sample_rate, channels_first, compression, format,
+        encoding, bits_per_sample)
+
+
+@_mod_utils.requires_sox()
+def info(filepath: str, format: Optional[str]=None, ) -> AudioInfo:
+    if hasattr(filepath, "read"):
+        sinfo = paddleaudio._paddleaudio.get_info_fileobj(filepath, format)
+        if sinfo is not None:
+            return AudioInfo(*sinfo)
+        return _fallback_info_fileobj(filepath, format)
+    filepath = os.fspath(filepath)
+    sinfo = paddleaudio._paddleaudio.get_info_file(filepath, format)
+    if sinfo is not None:
+        return AudioInfo(*sinfo)
+    return _fallback_info(filepath, format)

diff --git a/audio/paddleaudio/backends/utils.py b/audio/paddleaudio/backends/utils.py
new file mode 100644
index 000000000..4a7e51c02
--- /dev/null
+++ b/audio/paddleaudio/backends/utils.py
@@ -0,0 +1,81 @@
+"""Defines utilities for switching audio backends"""
+# Code from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/utils.py
+
+import warnings
+from typing import List
+from typing import Optional
+
+import paddleaudio
+from paddleaudio._internal import module_utils as _mod_utils
+
+from . import no_backend, soundfile_backend, sox_io_backend
+
+__all__ = [
+    "list_audio_backends",
+    "get_audio_backend",
+    "set_audio_backend",
+]
+
+
+def list_audio_backends() -> List[str]:
+    """List available backends
+
+    Returns:
+        List[str]: The list of available backends.
+    """
+    backends = []
+    if _mod_utils.is_module_available("soundfile"):
+        backends.append("soundfile")
+    if _mod_utils.is_sox_available():
+        backends.append("sox_io")
+    return backends
+
+
+def set_audio_backend(backend: Optional[str]):
+    """Set the backend for I/O operation
+
+    Args:
+        backend (str or None): Name of the backend.
+            One of ``"sox_io"`` or ``"soundfile"``, based on availability
+            on the system. If ``None`` is provided, the current backend is unassigned.
+    """
+    if backend is not None and backend not in list_audio_backends():
+        raise RuntimeError(f'Backend "{backend}" is not one of '
+                           f"available backends: {list_audio_backends()}.")
+
+    if backend is None:
+        module = no_backend
+    elif backend == "sox_io":
+        module = sox_io_backend
+    elif backend == "soundfile":
+        module = soundfile_backend
+    else:
+        raise NotImplementedError(f'Unexpected backend "{backend}"')
+
+    for func in ["save", "load", "info"]:
+        setattr(paddleaudio, func, getattr(module, func))
+
+
+def _init_audio_backend():
+    backends = list_audio_backends()
+    if "soundfile" in backends:
+        set_audio_backend("soundfile")
+    elif "sox_io" in backends:
+        set_audio_backend("sox_io")
+    else:
+        warnings.warn("No audio backend is available.")
+        set_audio_backend(None)
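+# Usage sketch (illustrative only, not part of this patch): picking a backend
+# explicitly and reading audio through the dispatched top-level functions.
+# "test.wav" is a placeholder path.
+#
+#   import paddleaudio
+#   print(list_audio_backends())             # e.g. ['soundfile', 'sox_io']
+#   set_audio_backend("soundfile")
+#   wav, sr = paddleaudio.load("test.wav")   # dispatched to soundfile_backend.load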
+def get_audio_backend() -> Optional[str]:
+    """Get the name of the current backend
+
+    Returns:
+        Optional[str]: The name of the current backend or ``None`` if no backend is assigned.
+ """ + if paddleaudio.load == no_backend.load: + return None + if paddleaudio.load == sox_io_backend.load: + return "sox_io" + if paddleaudio.load == soundfile_backend.load: + return "soundfile" + raise ValueError("Unknown backend.") diff --git a/audio/paddleaudio/compliance/__init__.py b/audio/paddleaudio/compliance/__init__.py new file mode 100644 index 000000000..c08f9ab11 --- /dev/null +++ b/audio/paddleaudio/compliance/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import kaldi +from . import librosa diff --git a/audio/paddleaudio/compliance/kaldi.py b/audio/paddleaudio/compliance/kaldi.py new file mode 100644 index 000000000..538be0196 --- /dev/null +++ b/audio/paddleaudio/compliance/kaldi.py @@ -0,0 +1,638 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from torchaudio(https://github.com/pytorch/audio) +import math +from typing import Tuple + +import paddle +from paddle import Tensor + +from ..functional import create_dct +from ..functional.window import get_window + +__all__ = [ + 'spectrogram', + 'fbank', + 'mfcc', +] + +# window types +HANNING = 'hann' +HAMMING = 'hamming' +POVEY = 'povey' +RECTANGULAR = 'rect' +BLACKMAN = 'blackman' + + +def _get_epsilon(dtype): + return paddle.to_tensor(1e-07, dtype=dtype) + + +def _next_power_of_2(x: int) -> int: + return 1 if x == 0 else 2**(x - 1).bit_length() + + +def _get_strided(waveform: Tensor, + window_size: int, + window_shift: int, + snip_edges: bool) -> Tensor: + assert waveform.dim() == 1 + num_samples = waveform.shape[0] + + if snip_edges: + if num_samples < window_size: + return paddle.empty((0, 0), dtype=waveform.dtype) + else: + m = 1 + (num_samples - window_size) // window_shift + else: + reversed_waveform = paddle.flip(waveform, [0]) + m = (num_samples + (window_shift // 2)) // window_shift + pad = window_size // 2 - window_shift // 2 + pad_right = reversed_waveform + if pad > 0: + pad_left = reversed_waveform[-pad:] + waveform = paddle.concat((pad_left, waveform, pad_right), axis=0) + else: + waveform = paddle.concat((waveform[-pad:], pad_right), axis=0) + + return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T + + +def _feature_window_function( + window_type: str, + window_size: int, + blackman_coeff: float, + dtype: int, ) -> Tensor: + if window_type == HANNING: + return get_window('hann', window_size, fftbins=False, dtype=dtype) + elif window_type == HAMMING: + return get_window('hamming', window_size, fftbins=False, dtype=dtype) + elif window_type == POVEY: + return get_window( + 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) + elif window_type == RECTANGULAR: + return paddle.ones([window_size], dtype=dtype) + elif window_type == BLACKMAN: + a = 2 * math.pi / (window_size - 1) + window_function = paddle.arange(window_size, dtype=dtype) + return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + + (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function) + ).astype(dtype) + else: + raise Exception('Invalid window type ' + window_type) + + +def _get_log_energy(strided_input: Tensor, epsilon: Tensor, + energy_floor: float) -> Tensor: + log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log() + if energy_floor == 0.0: + return log_energy + return paddle.maximum( + log_energy, + paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype)) + + +def _get_waveform_and_window_properties( + waveform: Tensor, + channel: int, + sr: int, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]: + channel = max(channel, 0) + assert channel < waveform.shape[0], ( + 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) + waveform = waveform[channel, :] # size (n) + window_shift = int( + sr * frame_shift * + 0.001) # pass frame_shift and frame_length in milliseconds + window_size = int(sr * frame_length * 0.001) + padded_window_size = _next_power_of_2( + window_size) if round_to_power_of_two else window_size + + assert 2 <= window_size <= len(waveform), ( + 'choose a window size {} that is [2, {}]'.format(window_size, + len(waveform))) + assert 0 < window_shift, '`window_shift` must be greater than 0' + assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' 
\ ' use `round_to_power_of_two` or change `frame_length`'
+    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
+    assert sr > 0, '`sr` must be greater than zero'
+    return waveform, window_shift, window_size, padded_window_size
+
+
+def _get_window(waveform: Tensor,
+                padded_window_size: int,
+                window_size: int,
+                window_shift: int,
+                window_type: str,
+                blackman_coeff: float,
+                snip_edges: bool,
+                raw_energy: bool,
+                energy_floor: float,
+                dither: float,
+                remove_dc_offset: bool,
+                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
+    dtype = waveform.dtype
+    epsilon = _get_epsilon(dtype)
+
+    # (m, window_size)
+    strided_input = _get_strided(waveform, window_size, window_shift,
+                                 snip_edges)
+
+    if dither != 0.0:
+        x = paddle.maximum(epsilon,
+                           paddle.rand(strided_input.shape, dtype=dtype))
+        rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x)
+        strided_input = strided_input + rand_gauss * dither
+
+    if remove_dc_offset:
+        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
+        strided_input = strided_input - row_means
+
+    if raw_energy:
+        signal_log_energy = _get_log_energy(strided_input, epsilon,
+                                            energy_floor)  # (m)
+
+    if preemphasis_coefficient != 0.0:
+        offset_strided_input = paddle.nn.functional.pad(
+            strided_input.unsqueeze(0), (1, 0),
+            data_format='NCL',
+            mode='replicate').squeeze(0)  # (m, window_size + 1)
+        strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]
+
+    window_function = _feature_window_function(
+        window_type, window_size, blackman_coeff,
+        dtype).unsqueeze(0)  # (1, window_size)
+    strided_input = strided_input * window_function  # (m, window_size)
+
+    # (m, padded_window_size)
+    if padded_window_size != window_size:
+        padding_right = padded_window_size - window_size
+        strided_input = paddle.nn.functional.pad(
+            strided_input.unsqueeze(0), (0, padding_right),
+            data_format='NCL',
+            mode='constant',
+            value=0).squeeze(0)
+
+    if not raw_energy:
+        signal_log_energy = _get_log_energy(strided_input, epsilon,
+                                            energy_floor)  # size (m)
+
+    return strided_input, signal_log_energy
+
+
+def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
+    if subtract_mean:
+        col_means = paddle.mean(tensor, axis=0).unsqueeze(0)
+        tensor = tensor - col_means
+    return tensor
+
+
+def spectrogram(waveform: Tensor,
+                blackman_coeff: float=0.42,
+                channel: int=-1,
+                dither: float=0.0,
+                energy_floor: float=1.0,
+                frame_length: float=25.0,
+                frame_shift: float=10.0,
+                preemphasis_coefficient: float=0.97,
+                raw_energy: bool=True,
+                remove_dc_offset: bool=True,
+                round_to_power_of_two: bool=True,
+                sr: int=16000,
+                snip_edges: bool=True,
+                subtract_mean: bool=False,
+                window_type: str=POVEY) -> Tensor:
+    """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that can't fit a single frame when it
+            is set True. Otherwise performs reflect padding to the end of the waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
+
+    Returns:
+        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames,
+            which depends on frame_length and frame_shift.
+    """
+    dtype = waveform.dtype
+    epsilon = _get_epsilon(dtype)
+
+    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
+        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
+        preemphasis_coefficient)
+
+    strided_input, signal_log_energy = _get_window(
+        waveform, padded_window_size, window_size, window_shift, window_type,
+        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
+        remove_dc_offset, preemphasis_coefficient)
+
+    # (m, padded_window_size // 2 + 1)
+    fft = paddle.fft.rfft(strided_input)
+
+    power_spectrum = paddle.maximum(
+        fft.abs().pow(2.), epsilon).log()  # (m, padded_window_size // 2 + 1)
+    power_spectrum[:, 0] = signal_log_energy
+
+    power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
+    return power_spectrum
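+# Usage sketch (illustrative only, not part of this patch; the random waveform
+# is placeholder data): a Kaldi-compatible log-spectrogram of 1 s of 16 kHz audio.
+#
+#   wav = paddle.randn([1, 16000])
+#   spec = spectrogram(wav, sr=16000)
+#   # 25 ms frames / 10 ms shift -> (frames, padded_window_size // 2 + 1) = (98, 257)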
+def _inverse_mel_scale_scalar(mel_freq: float) -> float:
+    return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
+
+
+def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
+    return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
+
+
+def _mel_scale_scalar(freq: float) -> float:
+    return 1127.0 * math.log(1.0 + freq / 700.0)
+
+
+def _mel_scale(freq: Tensor) -> Tensor:
+    return 1127.0 * (1.0 + freq / 700.0).log()
+
+
+def _vtln_warp_freq(vtln_low_cutoff: float,
+                    vtln_high_cutoff: float,
+                    low_freq: float,
+                    high_freq: float,
+                    vtln_warp_factor: float,
+                    freq: Tensor) -> Tensor:
+    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
+    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'
+    l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
+    h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
+    scale = 1.0 / vtln_warp_factor
+    Fl = scale * l
+    Fh = scale * h
+    assert l > low_freq and h < high_freq
+    scale_left = (Fl - low_freq) / (l - low_freq)
+    scale_right = (high_freq - Fh) / (high_freq - h)
+    res = paddle.empty_like(freq)
+
+    outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \
+        | paddle.greater_than(freq, paddle.to_tensor(high_freq))
+    before_l = paddle.less_than(freq, paddle.to_tensor(l))
+    before_h = paddle.less_than(freq, paddle.to_tensor(h))
+    after_h = paddle.greater_equal(freq, paddle.to_tensor(h))
+
+    res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
+    res[before_h] = scale * freq[before_h]
+    res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
+    res[outside_low_high_freq] = freq[outside_low_high_freq]
+
+    return res
+
+
+def _vtln_warp_mel_freq(vtln_low_cutoff: float,
vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: Tensor) -> Tensor: + return _mel_scale( + _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, + vtln_warp_factor, _inverse_mel_scale(mel_freq))) + + +def _get_mel_banks(num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float) -> Tuple[Tensor, Tensor]: + assert num_bins > 3, 'Must have at least 3 mel bins' + assert window_length_padded % 2 == 0 + num_fft_bins = window_length_padded / 2 + nyquist = 0.5 * sample_freq + + if high_freq <= 0.0: + high_freq += nyquist + + assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ + ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) + + fft_bin_width = sample_freq / window_length_padded + mel_low_freq = _mel_scale_scalar(low_freq) + mel_high_freq = _mel_scale_scalar(high_freq) + + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + if vtln_high < 0.0: + vtln_high += nyquist + + assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and + (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \ + ('Bad values in options: vtln-low {} and vtln-high {}, versus ' + 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) + + bin = paddle.arange(num_bins).unsqueeze(1) + left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) + + if vtln_warp_factor != 1.0: + left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel) + center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, + center_mel) + right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, right_mel) + + center_freqs = _inverse_mel_scale(center_mel) # (num_bins) + # (1, num_fft_bins) + mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) + + # (num_bins, num_fft_bins) + up_slope = (mel - left_mel) / (center_mel - left_mel) + down_slope = (right_mel - mel) / (right_mel - center_mel) + + if vtln_warp_factor == 1.0: + bins = paddle.maximum( + paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) + else: + bins = paddle.zeros_like(up_slope) + up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( + mel, center_mel) + down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( + mel, right_mel) + bins[up_idx] = up_slope[up_idx] + bins[down_idx] = down_slope[down_idx] + + return bins, center_freqs + + +def fbank(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + n_mels: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + use_log_fbank: bool=True, + use_power: bool=True, + vtln_high: float=-500.0, + vtln_low: float=100.0, + vtln_warp: float=1.0, + window_type: str=POVEY) -> Tensor: + """Compute and return filter banks from a waveform. 
The output is identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
+        htk_compat (bool, optional): Put energy at the last position when it is set True. Defaults to False.
+        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
+        n_mels (int, optional): Number of output mel bins. Defaults to 23.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that can't fit a whole frame when it
+            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract the mean of the feature matrix. Defaults to False.
+        use_energy (bool, optional): Add a dimension with the energy of the spectrogram to the output. Defaults to False.
+        use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
+        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
+        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
+        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
+        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
+
+    Returns:
+        Tensor: A filter banks tensor with shape `(m, n_mels)`.
+    """
+    dtype = waveform.dtype
+
+    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
+        waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two,
+        preemphasis_coefficient)
+
+    strided_input, signal_log_energy = _get_window(
+        waveform, padded_window_size, window_size, window_shift, window_type,
+        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
+        remove_dc_offset, preemphasis_coefficient)
+
+    # (m, padded_window_size // 2 + 1)
+    spectrum = paddle.fft.rfft(strided_input).abs()
+    if use_power:
+        spectrum = spectrum.pow(2.)
+
+    # (n_mels, padded_window_size // 2)
+    mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
+                                     high_freq, vtln_low, vtln_high, vtln_warp)
+    mel_energies = mel_energies.astype(dtype)
+
+    # (n_mels, padded_window_size // 2 + 1)
+    mel_energies = paddle.nn.functional.pad(
+        mel_energies.unsqueeze(0), (0, 1),
+        data_format='NCL',
+        mode='constant',
+        value=0).squeeze(0)
+
+    # (m, n_mels)
+    mel_energies = paddle.mm(spectrum, mel_energies.T)
+    if use_log_fbank:
+        mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log()
+
+    if use_energy:
+        signal_log_energy = signal_log_energy.unsqueeze(1)
+        if htk_compat:
+            mel_energies = paddle.concat(
+                (mel_energies, signal_log_energy), axis=1)
+        else:
+            mel_energies = paddle.concat(
+                (signal_log_energy, mel_energies), axis=1)
+
+    # (m, n_mels), or (m, n_mels + 1) when use_energy is True
+    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
+    return mel_energies
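
A quick usage sketch of the fbank API above (the sine waveform here is a hypothetical stand-in for real audio; any `(C, T)` float tensor works the same way):

    import numpy as np
    import paddle
    from paddleaudio.compliance import kaldi

    # One second of a 440 Hz tone at 16 kHz, shaped (C, T) with C == 1.
    t = np.arange(16000) / 16000.0
    waveform = paddle.to_tensor(
        np.sin(2 * np.pi * 440.0 * t), dtype='float32').unsqueeze(0)

    feat = kaldi.fbank(waveform, n_mels=23, sr=16000)
    print(feat.shape)  # (m, 23): m frames of 25 ms, shifted by 10 ms
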
+
+
+def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
+    dct_matrix = create_dct(n_mels, n_mels, 'ortho')
+    # Kaldi expects the first cepstral coefficient to be a scaled sum, so the
+    # first DCT column is overwritten accordingly.
+    dct_matrix[:, 0] = math.sqrt(1 / float(n_mels))
+    dct_matrix = dct_matrix[:, :n_mfcc]  # (n_mels, n_mfcc)
+    return dct_matrix
+
+
+def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
+    i = paddle.arange(n_mfcc)
+    return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i /
+                                                    cepstral_lifter)
+
+
+def mfcc(waveform: Tensor,
+         blackman_coeff: float=0.42,
+         cepstral_lifter: float=22.0,
+         channel: int=-1,
+         dither: float=0.0,
+         energy_floor: float=1.0,
+         frame_length: float=25.0,
+         frame_shift: float=10.0,
+         high_freq: float=0.0,
+         htk_compat: bool=False,
+         low_freq: float=20.0,
+         n_mfcc: int=13,
+         n_mels: int=23,
+         preemphasis_coefficient: float=0.97,
+         raw_energy: bool=True,
+         remove_dc_offset: bool=True,
+         round_to_power_of_two: bool=True,
+         sr: int=16000,
+         snip_edges: bool=True,
+         subtract_mean: bool=False,
+         use_energy: bool=False,
+         vtln_high: float=-500.0,
+         vtln_low: float=100.0,
+         vtln_warp: float=1.0,
+         window_type: str=POVEY) -> Tensor:
+    """Compute and return mel frequency cepstral coefficients from a waveform. The output is
+        identical to Kaldi's.
+
+    Args:
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
+        blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
+        cepstral_lifter (float, optional): Scaling of output MFCCs. Defaults to 22.0.
+        channel (int, optional): Select the channel of waveform. Defaults to -1.
+        dither (float, optional): Dithering constant. Defaults to 0.0.
+        energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
+        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
+        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
+        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
+        htk_compat (bool, optional): Put energy at the last position when it is set True. Defaults to False.
+        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
+        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
+        n_mels (int, optional): Number of output mel bins. Defaults to 23.
+        preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
+        raw_energy (bool, optional): Whether to compute energy before preemphasis and windowing. Defaults to True.
+        remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
+        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
+            to FFT. Defaults to True.
+        sr (int, optional): Sample rate of input waveform. Defaults to 16000.
+        snip_edges (bool, optional): Drop samples at the end of the waveform that can't fit a whole frame when it
+            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
+        subtract_mean (bool, optional): Whether to subtract the mean of the feature matrix. Defaults to False.
+        use_energy (bool, optional): Add a dimension with the energy of the spectrogram to the output. Defaults to False.
+        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
+        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
+        vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
+        window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
+
+    Returns:
+        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
+    """
+    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
+        n_mfcc, n_mels)
+
+    dtype = waveform.dtype
+
+    # (m, n_mels + use_energy)
+    feature = fbank(
+        waveform=waveform,
+        blackman_coeff=blackman_coeff,
+        channel=channel,
+        dither=dither,
+        energy_floor=energy_floor,
+        frame_length=frame_length,
+        frame_shift=frame_shift,
+        high_freq=high_freq,
+        htk_compat=htk_compat,
+        low_freq=low_freq,
+        n_mels=n_mels,
+        preemphasis_coefficient=preemphasis_coefficient,
+        raw_energy=raw_energy,
+        remove_dc_offset=remove_dc_offset,
+        round_to_power_of_two=round_to_power_of_two,
+        sr=sr,
+        snip_edges=snip_edges,
+        subtract_mean=False,
+        use_energy=use_energy,
+        use_log_fbank=True,
+        use_power=True,
+        vtln_high=vtln_high,
+        vtln_low=vtln_low,
+        vtln_warp=vtln_warp,
+        window_type=window_type)
+
+    if use_energy:
+        # (m)
+        signal_log_energy = feature[:, n_mels if htk_compat else 0]
+        mel_offset = int(not htk_compat)
+        feature = feature[:, mel_offset:(n_mels + mel_offset)]
+
+    # (n_mels, n_mfcc)
+    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)
+
+    # (m, n_mfcc)
+    feature = feature.matmul(dct_matrix)
+
+    if cepstral_lifter != 0.0:
+        # (1, n_mfcc)
+        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
+        feature *= lifter_coeffs.astype(dtype=dtype)
+
+    if use_energy:
+        feature[:, 0] = signal_log_energy
+
+    if htk_compat:
+        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
+        feature = feature[:, 1:]  # (m, n_mfcc - 1)
+        if not use_energy:
+            energy *= math.sqrt(2)
+
+        feature = paddle.concat((feature, energy), axis=1)
+
+    feature = _subtract_column_mean(feature, subtract_mean)
+    return feature
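
The mfcc API composes fbank with the DCT and liftering above; a matching sketch, reusing the hypothetical waveform from the fbank example:

    mfcc_feat = kaldi.mfcc(waveform, n_mfcc=13, n_mels=23, sr=16000)
    print(mfcc_feat.shape)  # (m, 13); n_mfcc must not exceed n_mels
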
diff --git a/audio/paddleaudio/compliance/librosa.py b/audio/paddleaudio/compliance/librosa.py
new file mode 100644
index 000000000..168632d7c
--- /dev/null
+++ b/audio/paddleaudio/compliance/librosa.py
@@ -0,0 +1,788 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from librosa (https://github.com/librosa/librosa)
+import warnings
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import scipy
+from numpy.lib.stride_tricks import as_strided
+from scipy import signal
+
+from ..backends import depth_convert
+from ..utils import ParameterError
+
+__all__ = [
+    # dsp
+    'stft',
+    'mfcc',
+    'hz_to_mel',
+    'mel_to_hz',
+    'mel_frequencies',
+    'power_to_db',
+    'compute_fbank_matrix',
+    'melspectrogram',
+    'spectrogram',
+    'mu_encode',
+    'mu_decode',
+    # augmentation
+    'depth_augment',
+    'spect_augment',
+    'random_crop1d',
+    'random_crop2d',
+    'adaptive_spect_augment',
+]
+
+
+def _pad_center(data: np.ndarray, size: int, axis: int=-1,
+                **kwargs) -> np.ndarray:
+    """Pad an array to a target length along a target axis.
+
+    This differs from `np.pad` by centering the data prior to padding,
+    analogous to `str.center`.
+    """
+
+    kwargs.setdefault("mode", "constant")
+    n = data.shape[axis]
+    lpad = int((size - n) // 2)
+    lengths = [(0, 0)] * data.ndim
+    lengths[axis] = (lpad, int(size - n - lpad))
+
+    if lpad < 0:
+        raise ParameterError(f"Target size ({size:d}) must be "
+                             f"at least input size ({n:d})")
+
+    return np.pad(data, lengths, **kwargs)
+
+
+def _split_frames(x: np.ndarray,
+                  frame_length: int,
+                  hop_length: int,
+                  axis: int=-1) -> np.ndarray:
+    """Slice a data array into (overlapping) frames.
+
+    This function is aligned with librosa.frame.
+    """
+
+    if not isinstance(x, np.ndarray):
+        raise ParameterError(
+            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
+
+    if x.shape[axis] < frame_length:
+        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
+                             f" for frame_length={frame_length:d}")
+
+    if hop_length < 1:
+        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
+
+    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.asfortranarray(x)
+    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.ascontiguousarray(x)
+
+    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
+    strides = np.asarray(x.strides)
+
+    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
+
+    if axis == -1:
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * new_stride]
+
+    elif axis == 0:
+        shape = [n_frames, frame_length] + list(x.shape)[1:]
+        strides = [hop_length * new_stride] + list(strides)
+
+    else:
+        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
+
+    return as_strided(x, shape=shape, strides=strides)
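
A shape check for the framing helper above; `_split_frames` is module-private, so this sketch is purely illustrative:

    import numpy as np

    x = np.arange(16000, dtype=np.float32)
    frames = _split_frames(x, frame_length=2048, hop_length=512)
    # A strided (no-copy) view: 1 + (16000 - 2048) // 512 == 28 frames.
    print(frames.shape)  # (2048, 28)
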
+
+
+def _check_audio(y, mono=True) -> bool:
+    """Determine whether a variable contains valid audio data.
+
+    The audio y must be a np.ndarray, either one-channel or two-channel.
+    """
+    if not isinstance(y, np.ndarray):
+        raise ParameterError("Audio data must be of type numpy.ndarray")
+    if y.ndim > 2:
+        raise ParameterError(
+            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if mono and y.ndim == 2:
+        raise ParameterError(
+            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if (mono and len(y) == 0) or (not mono and y.shape[1] == 0):
+        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
+
+    if not np.issubdtype(y.dtype, np.floating):
+        raise ParameterError("Audio data must be floating-point")
+
+    if not np.isfinite(y).all():
+        raise ParameterError("Audio buffer is not finite everywhere")
+
+    return True
+
+
+def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert Hz to Mels.
+
+    Args:
+        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequency in mels.
+    """
+    freq = np.asanyarray(frequencies)
+
+    if htk:
+        return 2595.0 * np.log10(1.0 + freq / 700.0)
+
+    # Fill in the linear part
+    f_min = 0.0
+    f_sp = 200.0 / 3
+
+    mels = (freq - f_min) / f_sp
+
+    # Fill in the log-scale part
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if freq.ndim:
+        # If we have array data, vectorize
+        log_t = freq >= min_log_hz
+        mels[log_t] = min_log_mel + \
+            np.log(freq[log_t] / min_log_hz) / logstep
+    elif freq >= min_log_hz:
+        # If we have scalar data, check directly
+        mels = min_log_mel + np.log(freq / min_log_hz) / logstep
+
+    return mels
+
+
+def mel_to_hz(mels: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert mel bin numbers to frequencies.
+
+    Args:
+        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequencies in Hz.
+    """
+    mel_array = np.asanyarray(mels)
+
+    if htk:
+        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
+
+    # Fill in the linear scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    freqs = f_min + f_sp * mel_array
+
+    # And now the nonlinear scale
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if mel_array.ndim:
+        # If we have vector data, vectorize
+        log_t = mel_array >= min_log_mel
+        freqs[log_t] = min_log_hz * \
+            np.exp(logstep * (mel_array[log_t] - min_log_mel))
+    elif mel_array >= min_log_mel:
+        # If we have scalar data, check directly
+        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
+
+    return freqs
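
A quick numeric check of the two conversions above (Slaney scale, the default):

    import numpy as np

    mels = hz_to_mel(np.array([440.0, 4000.0]))  # approx [6.6, 35.2]
    hz = mel_to_hz(mels)
    print(np.allclose(hz, [440.0, 4000.0]))  # True: the mapping is invertible
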
+ """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(fmin, htk=htk) + max_mel = hz_to_mel(fmax, htk=htk) + + mels = np.linspace(min_mel, max_mel, n_mels) + + return mel_to_hz(mels, htk=htk) + + +def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: + """Compute fourier frequencies. + + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + + Returns: + np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. + """ + return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=128, + fmin: float=0.0, + fmax: Optional[float]=None, + htk: bool=False, + norm: str="slaney", + dtype: type=np.float32) -> np.ndarray: + """Compute fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (str, optional): Type of normalization. Defaults to "slaney". + dtype (type, optional): Data type. Defaults to np.float32. + + + Returns: + np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. + """ + if norm != "slaney": + raise ParameterError('norm must set to slaney') + + if fmax is None: + fmax = float(sr) / 2 + + # Initialize the weights + n_mels = int(n_mels) + weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) + + fdiff = np.diff(mel_f) + ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = np.maximum(0, np.minimum(lower, upper)) + + if norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm[:, np.newaxis] + + # Only check weights if f_mel[0] is positive + if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): + # This means we have an empty channel somewhere + warnings.warn("Empty filters detected in mel frequency basis. " + "Some channels will produce empty responses. " + "Try increasing your sampling rate (and fmax) or " + "reducing n_mels.") + + return weights + + +def stft(x: np.ndarray, + n_fft: int=2048, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str="hann", + center: bool=True, + dtype: type=np.complex64, + pad_mode: str="reflect") -> np.ndarray: + """Short-time Fourier transform (STFT). + + Args: + x (np.ndarray): Input waveform in one dimension. + n_fft (int, optional): FFT size. Defaults to 2048. + hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. + win_length (Optional[int], optional): The size of window. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + dtype (type, optional): Data type of STFT results. Defaults to np.complex64. 
+ pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + + Returns: + np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. + """ + _check_audio(x) + + # By default, use the entire frame + if win_length is None: + win_length = n_fft + + # Set the default hop, if it's not already specified + if hop_length is None: + hop_length = int(win_length // 4) + + fft_window = signal.get_window(window, win_length, fftbins=True) + + # Pad the window out to n_fft size + fft_window = _pad_center(fft_window, n_fft) + + # Reshape so that the window can be broadcast + fft_window = fft_window.reshape((-1, 1)) + + # Pad the time series so that frames are centered + if center: + if n_fft > x.shape[-1]: + warnings.warn( + f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" + ) + x = np.pad(x, int(n_fft // 2), mode=pad_mode) + + elif n_fft > x.shape[-1]: + raise ParameterError( + f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" + ) + + # Window the time series. + x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) + # Pre-allocate the STFT matrix + stft_matrix = np.empty( + (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") + fft = np.fft # use numpy fft as default + # Constrain STFT block sizes to 256 KB + MAX_MEM_BLOCK = 2**8 * 2**10 + # how many columns can we fit within MAX_MEM_BLOCK? + n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) + n_columns = max(n_columns, 1) + + for bl_s in range(0, stft_matrix.shape[1], n_columns): + bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) + stft_matrix[:, bl_s:bl_t] = fft.rfft( + fft_window * x_frames[:, bl_s:bl_t], axis=0) + + return stft_matrix + + +def power_to_db(spect: np.ndarray, + ref: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=80.0) -> np.ndarray: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (np.ndarray): STFT power spectrogram of an input waveform. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. + + Returns: + np.ndarray: Power spectrogram in db scale. + """ + spect = np.asarray(spect) + + if amin <= 0: + raise ParameterError("amin must be strictly positive") + + if np.issubdtype(spect.dtype, np.complexfloating): + warnings.warn( + "power_to_db was called on complex input so phase " + "information will be discarded. 
To suppress this warning, "
+            "call power_to_db(np.abs(D)**2) instead.")
+        magnitude = np.abs(spect)
+    else:
+        magnitude = spect
+
+    if callable(ref):
+        # User supplied a function to calculate reference power
+        ref_value = ref(magnitude)
+    else:
+        ref_value = np.abs(ref)
+
+    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
+    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
+
+    if top_db is not None:
+        if top_db < 0:
+            raise ParameterError("top_db must be non-negative")
+        log_spec = np.maximum(log_spec, log_spec.max() - top_db)
+
+    return log_spec
+
+
+def mfcc(x: np.ndarray,
+         sr: int=16000,
+         spect: Optional[np.ndarray]=None,
+         n_mfcc: int=20,
+         dct_type: int=2,
+         norm: str="ortho",
+         lifter: int=0,
+         **kwargs) -> np.ndarray:
+    """Mel-frequency cepstral coefficients (MFCCs).
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None.
+        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
+        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
+        norm (str, optional): Type of normalization. Defaults to "ortho".
+        lifter (int, optional): Cepstral filtering. Defaults to 0.
+
+    Returns:
+        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
+    """
+    if spect is None:
+        spect = melspectrogram(x, sr=sr, **kwargs)
+
+    M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
+
+    if lifter > 0:
+        factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
+                        lifter)
+        return M * factor[:, np.newaxis]
+    elif lifter == 0:
+        return M
+    else:
+        raise ParameterError(
+            f"MFCC lifter={lifter} must be a non-negative number")
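
Unlike the Kaldi-style API earlier in this patch, this mfcc takes a 1-D numpy waveform and returns `(n_mfcc, num_frames)`; a short sketch with synthetic input:

    import numpy as np

    x = np.random.randn(16000).astype(np.float32)  # 1 s of noise at 16 kHz
    feats = mfcc(x, sr=16000, n_mfcc=20)  # melspectrogram + DCT under the hood
    print(feats.shape)  # (20, 51) with the default window_size=512, hop_length=320
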
+
+
+def melspectrogram(x: np.ndarray,
+                   sr: int=16000,
+                   window_size: int=512,
+                   hop_length: int=320,
+                   n_mels: int=64,
+                   fmin: float=50.0,
+                   fmax: Optional[float]=None,
+                   window: str='hann',
+                   center: bool=True,
+                   pad_mode: str='reflect',
+                   power: float=2.0,
+                   to_db: bool=True,
+                   ref: float=1.0,
+                   amin: float=1e-10,
+                   top_db: Optional[float]=None) -> np.ndarray:
+    """Compute mel-spectrogram.
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
+        to_db (bool, optional): Enable db scale. Defaults to True.
+        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
+
+    Returns:
+        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
+    """
+    _check_audio(x, mono=True)
+    if len(x) <= 0:
+        raise ParameterError('The input waveform is empty')
+
+    if fmax is None:
+        fmax = sr // 2
+    if fmin < 0 or fmin >= fmax:
+        raise ParameterError('fmin and fmax must satisfy 0 < fmin < fmax')
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    spect_power = np.abs(s)**power
+    fb_matrix = compute_fbank_matrix(
+        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
+    mel_spect = np.matmul(fb_matrix, spect_power)
+    if to_db:
+        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
+    else:
+        return mel_spect
+
+
+def spectrogram(x: np.ndarray,
+                sr: int=16000,
+                window_size: int=512,
+                hop_length: int=320,
+                window: str='hann',
+                center: bool=True,
+                pad_mode: str='reflect',
+                power: float=2.0) -> np.ndarray:
+    """Compute spectrogram.
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+
+    Returns:
+        np.ndarray: The STFT spectrogram in power scale with shape `(n_fft//2 + 1, num_frames)`.
+    """
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    return np.abs(s)**power
+
+
+def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0, mu]`. Otherwise, the resulting waveform is in range `[-1, 1]`.
+
+    Args:
+        x (np.ndarray): The input waveform to encode.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
+
+    Returns:
+        np.ndarray: The mu-law encoded waveform.
+    """
+    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
+    if quantized:
+        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0, mu]
+    return y
+
+
+def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0, mu]` when quantized is True and `[-1, 1]` otherwise.
+
+    Args:
+        y (np.ndarray): The encoded waveform.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
+
+    Returns:
+        np.ndarray: The mu-law decoded waveform.
+    """
+    if mu < 1:
+        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
+
+    if quantized:  # undo the quantization, mapping [0, mu] back to [-1, 1]
+        y = y * 2 / mu - 1
+    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
+    return x
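
A round-trip sketch for the companding pair above; the non-quantized path is exactly invertible, while the quantized path recovers the input only up to quantization error:

    import numpy as np

    x = np.linspace(-1.0, 1.0, 5)
    y = mu_encode(x, mu=255, quantized=False)  # companded, still in [-1, 1]
    x_hat = mu_decode(y, mu=255, quantized=False)
    print(np.allclose(x, x_hat))  # True
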
+
+
+def _randint(high: int) -> int:
+    """Generate one random integer in range [0, high).
+
+    This is a helper function for random data augmentation.
+    """
+    return int(np.random.randint(0, high=high))
+
+
+def depth_augment(y: np.ndarray,
+                  choices: List=['int8', 'int16'],
+                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
+    """Audio depth augmentation. Simulates the distortion brought by quantization by converting the waveform to a lower bit depth and back.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        choices (List, optional): A list of data types for depth conversion. Defaults to ['int8', 'int16'].
+        probs (List[float], optional): Probabilities of each depth conversion. Defaults to [0.5, 0.5].
+
+    Returns:
+        np.ndarray: The augmented waveform.
+    """
+    assert len(probs) == len(
+        choices
+    ), 'number of choices {} must be equal to size of probs {}'.format(
+        len(choices), len(probs))
+    depth = np.random.choice(choices, p=probs)
+    src_depth = y.dtype
+    y1 = depth_convert(y, depth)
+    y2 = depth_convert(y1, src_depth)
+
+    return y2
+
+
+def adaptive_spect_augment(spect: np.ndarray,
+                           tempo_axis: int=0,
+                           level: float=0.1) -> np.ndarray:
+    """Do adaptive spectrogram augmentation. The level of the augmentation is governed by the parameter `level`, ranging from 0 to 1, with 0 representing no augmentation.
+
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        level (float, optional): The level factor of masking. Defaults to 0.1.
+
+    Returns:
+        np.ndarray: The augmented spectrogram.
+    """
+    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    time_mask_width = int(nt * level * 0.5)
+    freq_mask_width = int(nf * level * 0.5)
+
+    num_time_mask = int(10 * level)
+    num_freq_mask = int(10 * level)
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
+
+
+def spect_augment(spect: np.ndarray,
+                  tempo_axis: int=0,
+                  max_time_mask: int=3,
+                  max_freq_mask: int=3,
+                  max_time_mask_width: int=30,
+                  max_freq_mask_width: int=20) -> np.ndarray:
+    """Do spectrogram augmentation in both time and freq axis.
+
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        max_time_mask (int, optional): Maximum number of time maskings. Defaults to 3.
+        max_freq_mask (int, optional): Maximum number of frequency maskings. Defaults to 3.
+        max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
+        max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20.
+
+    Returns:
+        np.ndarray: The augmented spectrogram.
+    """
+    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    num_time_mask = _randint(max_time_mask)
+    num_freq_mask = _randint(max_freq_mask)
+
+    time_mask_width = _randint(max_time_mask_width)
+    freq_mask_width = _randint(max_freq_mask_width)
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = _randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = _randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
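
A usage sketch for the masking augmentation above (results are random; mask counts and widths are drawn per call, and the input is modified in place):

    import numpy as np

    spect = np.random.rand(100, 64) + 1.0  # (time, freq), tempo_axis == 0
    augmented = spect_augment(spect, tempo_axis=0)
    print((augmented == 0).sum())  # number of masked cells; varies run to run
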
+
+
+def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
+    """Random cropping on an input waveform.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D.
+        crop_len (int): Length of waveform to crop.
+
+    Returns:
+        np.ndarray: The cropped waveform.
+    """
+    if y.ndim != 1:
+        raise ParameterError('only accept 1d tensor or numpy array')
+    n = len(y)
+    idx = _randint(n - crop_len)
+    return y[idx:idx + crop_len]
+
+
+def random_crop2d(s: np.ndarray, crop_len: int,
+                  tempo_axis: int=0) -> np.ndarray:
+    """Random cropping on a spectrogram.
+
+    Args:
+        s (np.ndarray): Input spectrogram in 2D.
+        crop_len (int): Length of spectrogram to crop.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+
+    Returns:
+        np.ndarray: The cropped spectrogram.
+    """
+    if tempo_axis >= s.ndim:
+        raise ParameterError('axis out of range')
+
+    n = s.shape[tempo_axis]
+    idx = _randint(high=n - crop_len)
+    sli = [slice(None) for i in range(s.ndim)]
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    out = s[tuple(sli)]
+    return out
diff --git a/audio/paddleaudio/datasets/__init__.py b/audio/paddleaudio/datasets/__init__.py
new file mode 100644
index 000000000..f95fad305
--- /dev/null
+++ b/audio/paddleaudio/datasets/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .esc50 import ESC50
+from .gtzan import GTZAN
+from .hey_snips import HeySnips
+from .rirs_noises import OpenRIRNoise
+from .tess import TESS
+from .urban_sound import UrbanSound8K
+from .voxceleb import VoxCeleb
diff --git a/audio/paddleaudio/datasets/dataset.py b/audio/paddleaudio/datasets/dataset.py
new file mode 100644
index 000000000..f1dfc1ea3
--- /dev/null
+++ b/audio/paddleaudio/datasets/dataset.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..compliance.kaldi import fbank as kaldi_fbank
+from ..compliance.kaldi import mfcc as kaldi_mfcc
+from ..compliance.librosa import melspectrogram
+from ..compliance.librosa import mfcc
+
+feat_funcs = {
+    'raw': None,
+    'melspectrogram': melspectrogram,
+    'mfcc': mfcc,
+    'kaldi_fbank': kaldi_fbank,
+    'kaldi_mfcc': kaldi_mfcc,
+}
+
+
+class AudioClassificationDataset(paddle.io.Dataset):
+    """
+    Base class of audio classification dataset.
+ """ + + def __init__(self, + files: List[str], + labels: List[int], + feat_type: str='raw', + sample_rate: int=None, + **kwargs): + """ + Ags: + files (:obj:`List[str]`): A list of absolute path of audio files. + labels (:obj:`List[int]`): Labels of audio files. + feat_type (:obj:`str`, `optional`, defaults to `raw`): + It identifies the feature type that user wants to extrace of an audio file. + """ + super(AudioClassificationDataset, self).__init__() + + if feat_type not in feat_funcs.keys(): + raise RuntimeError( + f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}" + ) + + self.files = files + self.labels = labels + + self.feat_type = feat_type + self.sample_rate = sample_rate + self.feat_config = kwargs # Pass keyword arguments to customize feature config + + def _get_data(self, input_file: str): + raise NotImplementedError + + def _convert_to_record(self, idx): + file, label = self.files[idx], self.labels[idx] + + if self.sample_rate is None: + waveform, sample_rate = load_audio(file) + else: + waveform, sample_rate = load_audio(file, sr=self.sample_rate) + + feat_func = feat_funcs[self.feat_type] + + record = {} + if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']: + waveform = paddle.to_tensor(waveform).unsqueeze(0) # (C, T) + record['feat'] = feat_func( + waveform=waveform, sr=self.sample_rate, **self.feat_config) + else: + record['feat'] = feat_func( + waveform, sample_rate, + **self.feat_config) if feat_func else waveform + record['label'] = label + return record + + def __getitem__(self, idx): + record = self._convert_to_record(idx) + if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']: + return self.keys[idx], record['feat'], record['label'] + else: + return np.array(record['feat']).transpose(), np.array( + record['label'], dtype=np.int64) + + def __len__(self): + return len(self.files) diff --git a/audio/paddleaudio/datasets/esc50.py b/audio/paddleaudio/datasets/esc50.py new file mode 100644 index 000000000..e7477d40e --- /dev/null +++ b/audio/paddleaudio/datasets/esc50.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os +from typing import List +from typing import Tuple + +from ..utils.download import download_and_decompress +from ..utils.env import DATA_HOME +from .dataset import AudioClassificationDataset + +__all__ = ['ESC50'] + + +class ESC50(AudioClassificationDataset): + """ + The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings + suitable for benchmarking methods of environmental sound classification. 
The dataset + consists of 5-second-long recordings organized into 50 semantical classes (with + 40 examples per class) + + Reference: + ESC: Dataset for Environmental Sound Classification + http://dx.doi.org/10.1145/2733373.2806390 + """ + + archieves = [ + { + 'url': + 'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip', + 'md5': '7771e4b9d86d0945acce719c7a59305a', + }, + ] + label_list = [ + # Animals + 'Dog', + 'Rooster', + 'Pig', + 'Cow', + 'Frog', + 'Cat', + 'Hen', + 'Insects (flying)', + 'Sheep', + 'Crow', + # Natural soundscapes & water sounds + 'Rain', + 'Sea waves', + 'Crackling fire', + 'Crickets', + 'Chirping birds', + 'Water drops', + 'Wind', + 'Pouring water', + 'Toilet flush', + 'Thunderstorm', + # Human, non-speech sounds + 'Crying baby', + 'Sneezing', + 'Clapping', + 'Breathing', + 'Coughing', + 'Footsteps', + 'Laughing', + 'Brushing teeth', + 'Snoring', + 'Drinking, sipping', + # Interior/domestic sounds + 'Door knock', + 'Mouse click', + 'Keyboard typing', + 'Door, wood creaks', + 'Can opening', + 'Washing machine', + 'Vacuum cleaner', + 'Clock alarm', + 'Clock tick', + 'Glass breaking', + # Exterior/urban noises + 'Helicopter', + 'Chainsaw', + 'Siren', + 'Car horn', + 'Engine', + 'Train', + 'Church bells', + 'Airplane', + 'Fireworks', + 'Hand saw', + ] + meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv') + meta_info = collections.namedtuple( + 'META_INFO', + ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take')) + audio_path = os.path.join('ESC-50-master', 'audio') + + def __init__(self, + mode: str='train', + split: int=1, + feat_type: str='raw', + **kwargs): + """ + Ags: + mode (:obj:`str`, `optional`, defaults to `train`): + It identifies the dataset mode (train or dev). + split (:obj:`int`, `optional`, defaults to 1): + It specify the fold of dev dataset. + feat_type (:obj:`str`, `optional`, defaults to `raw`): + It identifies the feature type that user wants to extrace of an audio file. + """ + files, labels = self._get_data(mode, split) + super(ESC50, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) + + def _get_meta_info(self) -> List[collections.namedtuple]: + ret = [] + with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: + for line in rf.readlines()[1:]: + ret.append(self.meta_info(*line.strip().split(','))) + return ret + + def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ + not os.path.isfile(os.path.join(DATA_HOME, self.meta)): + download_and_decompress(self.archieves, DATA_HOME) + + meta_info = self._get_meta_info() + + files = [] + labels = [] + for sample in meta_info: + filename, fold, target, _, _, _, _ = sample + if mode == 'train' and int(fold) != split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + if mode != 'train' and int(fold) == split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + return files, labels diff --git a/audio/paddleaudio/datasets/gtzan.py b/audio/paddleaudio/datasets/gtzan.py new file mode 100644 index 000000000..cfea6f37e --- /dev/null +++ b/audio/paddleaudio/datasets/gtzan.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+import random
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['GTZAN']
+
+
+class GTZAN(AudioClassificationDataset):
+    """
+    The GTZAN dataset consists of 1000 audio tracks, each 30 seconds long. It contains 10 genres,
+    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
+    in machine listening research for music genre recognition (MGR).
+
+    Reference:
+        Musical genre classification of audio signals
+        https://ieeexplore.ieee.org/document/1021072/
+    """
+
+    archieves = [
+        {
+            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
+            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
+        },
+    ]
+    label_list = [
+        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
+        'pop', 'reggae', 'rock'
+    ]
+    meta = os.path.join('genres', 'input.mf')
+    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
+    audio_path = 'genres'
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            seed (:obj:`int`, `optional`, defaults to 0):
+                Set the random seed to shuffle samples.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specifies the fold of dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extract from an audio file.
+ """ + assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' + files, labels = self._get_data(mode, seed, n_folds, split) + super(GTZAN, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs) + + def _get_meta_info(self) -> List[collections.namedtuple]: + ret = [] + with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: + for line in rf.readlines(): + ret.append(self.meta_info(*line.strip().split('\t'))) + return ret + + def _get_data(self, mode, seed, n_folds, + split) -> Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ + not os.path.isfile(os.path.join(DATA_HOME, self.meta)): + download_and_decompress(self.archieves, DATA_HOME) + + meta_info = self._get_meta_info() + random.seed(seed) # shuffle samples to split data + random.shuffle( + meta_info + ) # make sure using the same seed to create train and dev dataset + + files = [] + labels = [] + n_samples_per_fold = len(meta_info) // n_folds + for idx, sample in enumerate(meta_info): + file_path, label = sample + filename = os.path.basename(file_path) + target = self.label_list.index(label) + fold = idx // n_samples_per_fold + 1 + + if mode == 'train' and int(fold) != split: + files.append( + os.path.join(DATA_HOME, self.audio_path, label, filename)) + labels.append(target) + + if mode != 'train' and int(fold) == split: + files.append( + os.path.join(DATA_HOME, self.audio_path, label, filename)) + labels.append(target) + + return files, labels diff --git a/audio/paddleaudio/datasets/hey_snips.py b/audio/paddleaudio/datasets/hey_snips.py new file mode 100644 index 000000000..7a67b843b --- /dev/null +++ b/audio/paddleaudio/datasets/hey_snips.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import collections +import json +import os +from typing import List +from typing import Tuple + +from .dataset import AudioClassificationDataset + +__all__ = ['HeySnips'] + + +class HeySnips(AudioClassificationDataset): + meta_info = collections.namedtuple('META_INFO', + ('key', 'label', 'duration', 'wav')) + + def __init__(self, + data_dir: os.PathLike, + mode: str='train', + feat_type: str='kaldi_fbank', + sample_rate: int=16000, + **kwargs): + self.data_dir = data_dir + files, labels = self._get_data(mode) + super(HeySnips, self).__init__( + files=files, + labels=labels, + feat_type=feat_type, + sample_rate=sample_rate, + **kwargs) + + def _get_meta_info(self, mode) -> List[collections.namedtuple]: + ret = [] + with open(os.path.join(self.data_dir, '{}.json'.format(mode)), + 'r') as f: + data = json.load(f) + for item in data: + sample = collections.OrderedDict() + if item['duration'] > 0: + sample['key'] = item['id'] + sample['label'] = 0 if item['is_hotword'] == 1 else -1 + sample['duration'] = item['duration'] + sample['wav'] = os.path.join(self.data_dir, + item['audio_file_path']) + ret.append(self.meta_info(*sample.values())) + return ret + + def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: + meta_info = self._get_meta_info(mode) + + files = [] + labels = [] + self.keys = [] + self.durations = [] + for sample in meta_info: + key, target, duration, wav = sample + files.append(wav) + labels.append(int(target)) + self.keys.append(key) + self.durations.append(float(duration)) + + return files, labels diff --git a/audio/paddleaudio/datasets/rirs_noises.py b/audio/paddleaudio/datasets/rirs_noises.py new file mode 100644 index 000000000..74418daa2 --- /dev/null +++ b/audio/paddleaudio/datasets/rirs_noises.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import collections +import csv +import os +import random +from typing import List + +from paddle.io import Dataset +from tqdm import tqdm + +from ..backends.soundfile_backend import soundfile_load as load_audio +from ..backends.soundfile_backend import soundfile_save as save_wav +from ..utils import DATA_HOME +from ..utils.download import download_and_decompress +from .dataset import feat_funcs + +__all__ = ['OpenRIRNoise'] + + +class OpenRIRNoise(Dataset): + archieves = [ + { + 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip', + 'md5': 'e6f48e257286e05de56413b4779d8ffb', + }, + ] + + sample_rate = 16000 + meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) + base_path = os.path.join(DATA_HOME, 'open_rir_noise') + wav_path = os.path.join(base_path, 'RIRS_NOISES') + csv_path = os.path.join(base_path, 'csv') + subsets = ['rir', 'noise'] + + def __init__(self, + subset: str='rir', + feat_type: str='raw', + target_dir=None, + random_chunk: bool=True, + chunk_duration: float=3.0, + seed: int=0, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + + OpenRIRNoise.csv_path = os.path.join( + target_dir, "open_rir_noise", + "csv") if target_dir else self.csv_path + self._data = self._get_data() + super(OpenRIRNoise, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + print(f"rirs noises base path: {self.base_path}") + if not os.path.isdir(self.base_path): + download_and_decompress( + self.archieves, self.base_path, decompress=True) + else: + print( + f"{self.base_path} already exists, we will not download and decompress again" + ) + + # Data preparation. 
+ print(f"prepare the csv to {self.csv_path}") + if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav = line.strip().split(',') + data.append(self.meta_info(audio_id, float(duration), wav)) + + random.shuffle(data) + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for idx, chunk in enumerate(uniq_chunks_list): + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + new_wav_file = os.path.join(self.base_path, + audio_id + f'_chunk_{idx+1:02}.wav') + save_wav(waveform[start_sample:end_sample], sr, new_wav_file) + # id, duration, new_wav + ret.append([chunk, self.chunk_duration, new_wav_file]) + else: # Keep whole audio. 
+ ret.append([audio_id, audio_duration, wav_file]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool=True): + print(f'Generating csv: {output_file}') + header = ["id", "duration", "wav"] + + infos = list( + tqdm( + map(self._get_audio_info, wav_files, [split_chunks] * len( + wav_files)), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises", + "rir_list") + rir_files = [] + with open(rir_list, 'r') as f: + for line in f.readlines(): + rir_file = line.strip().split(' ')[-1] + rir_files.append(os.path.join(self.base_path, rir_file)) + + noise_list = os.path.join(self.wav_path, "pointsource_noises", + "noise_list") + noise_files = [] + with open(noise_list, 'r') as f: + for line in f.readlines(): + noise_file = line.strip().split(' ')[-1] + noise_files.append(os.path.join(self.base_path, noise_file)) + + self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv')) + self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv')) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/audio/paddleaudio/datasets/tess.py b/audio/paddleaudio/datasets/tess.py new file mode 100644 index 000000000..8faab9c39 --- /dev/null +++ b/audio/paddleaudio/datasets/tess.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os +import random +from typing import List +from typing import Tuple + +from ..utils.download import download_and_decompress +from ..utils.env import DATA_HOME +from .dataset import AudioClassificationDataset + +__all__ = ['TESS'] + + +class TESS(AudioClassificationDataset): + """ + TESS is a set of 200 target words were spoken in the carrier phrase + "Say the word _____' by two actresses (aged 26 and 64 years) and + recordings were made of the set portraying each of seven emotions(anger, + disgust, fear, happiness, pleasant surprise, sadness, and neutral). + There are 2800 stimuli in total. 
+
+    Reference:
+        Toronto emotional speech set (TESS)
+        https://doi.org/10.5683/SP2/E8H2MF
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
+            'md5':
+            '1465311b24d1de704c4c63e4ccc470c7',
+        },
+    ]
+    label_list = [
+        'angry',
+        'disgust',
+        'fear',
+        'happy',
+        'neutral',
+        'ps',  # pleasant surprise
+        'sad',
+    ]
+    meta_info = collections.namedtuple('META_INFO',
+                                       ('speaker', 'word', 'emotion'))
+    audio_path = 'TESS_Toronto_emotional_speech_set'
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            seed (:obj:`int`, `optional`, defaults to 0):
+                Set the random seed to shuffle samples.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds: 1 fold for the dev dataset and n-1 folds for the train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specifies the fold of the dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that the user wants to extract from an audio file.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
+        files, labels = self._get_data(mode, seed, n_folds, split)
+        super(TESS, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self, files) -> List[collections.namedtuple]:
+        ret = []
+        for file in files:
+            basename_without_extend = os.path.basename(file)[:-4]
+            ret.append(self.meta_info(*basename_without_extend.split('_')))
+        return ret
+
+    def _get_data(self, mode, seed, n_folds,
+                  split) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        wav_files = []
+        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
+            for file in files:
+                if file.endswith('.wav'):
+                    wav_files.append(os.path.join(root, file))
+
+        random.seed(seed)  # shuffle samples to split data
+        random.shuffle(
+            wav_files
+        )  # make sure using the same seed to create train and dev dataset
+        meta_info = self._get_meta_info(wav_files)
+
+        files = []
+        labels = []
+        n_samples_per_fold = len(meta_info) // n_folds
+        for idx, sample in enumerate(meta_info):
+            _, _, emotion = sample
+            target = self.label_list.index(emotion)
+            fold = idx // n_samples_per_fold + 1
+
+            if mode == 'train' and int(fold) != split:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+            if mode != 'train' and int(fold) == split:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+        return files, labels
diff --git a/audio/paddleaudio/datasets/urban_sound.py b/audio/paddleaudio/datasets/urban_sound.py
new file mode 100644
index 000000000..d97c4d1dc
--- /dev/null
+++ b/audio/paddleaudio/datasets/urban_sound.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['UrbanSound8K']
+
+
+class UrbanSound8K(AudioClassificationDataset):
+    """
+    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
+    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
+    drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
+    classes are drawn from the urban sound taxonomy.
+
+    Reference:
+        A Dataset and Taxonomy for Urban Sound Research
+        https://dl.acm.org/doi/10.1145/2647868.2655045
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
+            'md5': '9aa69802bbf37fb986f71ec1483a196e',
+        },
+    ]
+    label_list = [
+        "air_conditioner", "car_horn", "children_playing", "dog_bark",
+        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
+        "street_music"
+    ]
+    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
+    meta_info = collections.namedtuple(
+        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
+                      'class_id', 'label'))
+    audio_path = os.path.join('UrbanSound8K', 'audio')
+
+    def __init__(self,
+                 mode: str='train',
+                 split: int=1,
+                 feat_type: str='raw',
+                 **kwargs):
+        """
+        Args:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specifies the fold of the dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that the user wants to extract from an audio file.
+        """
+        files, labels = self._get_data(mode, split)
+        super(UrbanSound8K, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self):
+        ret = []
+        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                ret.append(self.meta_info(*line.strip().split(',')))
+        return ret
+
+    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info()
+
+        files = []
+        labels = []
+        for sample in meta_info:
+            filename, _, _, _, _, fold, target, _ = sample
+            if mode == 'train' and int(fold) != split:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
+                                 filename))
+                labels.append(int(target))
+
+            if mode != 'train' and int(fold) == split:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
+                                 filename))
+                labels.append(int(target))
+
+        return files, labels
diff --git a/audio/paddleaudio/datasets/voxceleb.py b/audio/paddleaudio/datasets/voxceleb.py
new file mode 100644
index 000000000..b7160b24c
--- /dev/null
+++ b/audio/paddleaudio/datasets/voxceleb.py
@@ -0,0 +1,356 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import csv
+import glob
+import os
+import random
+from multiprocessing import cpu_count
+from typing import List
+
+from paddle.io import Dataset
+from pathos.multiprocessing import Pool
+from tqdm import tqdm
+
+from ..backends.soundfile_backend import soundfile_load as load_audio
+from ..utils import DATA_HOME
+from ..utils import decompress
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
+
+__all__ = ['VoxCeleb']
+
+
+class VoxCeleb(Dataset):
+    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
+    archieves_audio_dev = [
+        {
+            'url': source_url + 'vox1_dev_wav_partaa',
+            'md5': 'e395d020928bc15670b570a21695ed96',
+        },
+        {
+            'url': source_url + 'vox1_dev_wav_partab',
+            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
+        },
+        {
+            'url': source_url + 'vox1_dev_wav_partac',
+            'md5': '017d579a2a96a077f40042ec33e51512',
+        },
+        {
+            'url': source_url + 'vox1_dev_wav_partad',
+            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
+        },
+    ]
+    archieves_audio_test = [
+        {
+            'url': source_url + 'vox1_test_wav.zip',
+            'md5': '185fdc63c3c739954633d50379a3d102',
+        },
+    ]
+    archieves_meta = [
+        {
+            'url':
+            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
+            'md5':
+            'b73110731c9223c1461fe49cb48dddfc',
+        },
+    ]
+
+    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+    sample_rate = 16000
+    meta_info = collections.namedtuple(
+        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
+    base_path = os.path.join(DATA_HOME, 'vox1')
+    wav_path = os.path.join(base_path, 'wav')
+    meta_path = os.path.join(base_path, 'meta')
+    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
+    csv_path = os.path.join(base_path, 'csv')
+    subsets = ['train', 'dev', 'enroll', 'test']
+
+    def __init__(
+            self,
+            subset: str='train',
+            feat_type: str='raw',
+            random_chunk: bool=True,
+            chunk_duration: float=3.0,  # seconds
+            split_ratio: float=0.9,  # train split ratio
+            seed: int=0,
+            target_dir: str=None,
+            vox2_base_path=None,
+            **kwargs):
+        """Prepare the VoxCeleb dataset and get the audio info of the specified subset.
+
+        Args:
+            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
+            feat_type (str, optional): feature type, such as raw, melspectrogram (fbank) or mfcc. Defaults to 'raw'.
+            random_chunk (bool, optional): randomly select a chunk of `chunk_duration` seconds from each audio. Defaults to True.
+            chunk_duration (float, optional): chunk duration if the random_chunk flag is set. Defaults to 3.0.
+            split_ratio (float, optional): ratio of audio files assigned to the train split. Defaults to 0.9.
+            seed (int, optional): random seed used to reproduce the train/dev split. Defaults to 0.
+            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
+            vox2_base_path (str, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
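+
+        A minimal usage sketch (download and csv preparation happen on first
+        use; the record layout is shown for illustration only):
+
+            train_ds = VoxCeleb(subset='train', feat_type='melspectrogram')
+            record = train_ds[0]  # dict with 'feat', 'label' and meta fields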
+ """ + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.spk_id2label = {} + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self.split_ratio = split_ratio + self.target_dir = target_dir if target_dir else VoxCeleb.base_path + self.vox2_base_path = vox2_base_path + + # if we set the target dir, we will change the vox data info data from base path to target dir + VoxCeleb.csv_path = os.path.join( + target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path + VoxCeleb.meta_path = os.path.join( + target_dir, "voxceleb", + 'meta') if target_dir else VoxCeleb.meta_path + VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path, + 'veri_test2.txt') + # self._data = self._get_data()[:1000] # KP: Small dataset test. + self._data = self._get_data() + super(VoxCeleb, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir + # so, we check the vox1/wav dir status + print(f"wav base path: {self.wav_path}") + if not os.path.isdir(self.wav_path): + print("start to download the voxceleb1 dataset") + download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip + self.archieves_audio_dev, + self.base_path, + decompress=False) + download_and_decompress( # download the vox1_test_wav.zip and unzip + self.archieves_audio_test, + self.base_path, + decompress=True) + + # Download all parts and concatenate the files into one zip file. + dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') + print(f'Concatenating all parts to: {dev_zipfile}') + os.system( + f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' + ) + + # Extract all audio files of dev and test set. + decompress(dev_zipfile, self.base_path) + + # Download meta files. + if not os.path.isdir(self.meta_path): + print("prepare the meta data") + download_and_decompress( + self.archieves_meta, self.meta_path, decompress=False) + + # Data preparation. 
+        if not os.path.isdir(self.csv_path):
+            os.makedirs(self.csv_path)
+            self.prepare_data()
+
+        data = []
+        print(
+            f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"
+        )
+        with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                audio_id, duration, wav, start, stop, spk_id = line.strip(
+                ).split(',')
+                data.append(
+                    self.meta_info(audio_id,
+                                   float(duration), wav,
+                                   int(start), int(stop), spk_id))
+
+        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
+            for line in f.readlines():
+                spk_id, label = line.strip().split(' ')
+                self.spk_id2label[spk_id] = int(label)
+
+        return data
+
+    def _convert_to_record(self, idx: int):
+        sample = self._data[idx]
+
+        record = {}
+        # To show all fields in a namedtuple: `type(sample)._fields`
+        for field in type(sample)._fields:
+            record[field] = getattr(sample, field)
+
+        waveform, sr = load_audio(record['wav'])
+
+        # Randomly select a chunk of samples from the audio.
+        if self.random_chunk:
+            num_wav_samples = waveform.shape[0]
+            num_chunk_samples = int(self.chunk_duration * sr)
+            start = random.randint(0, num_wav_samples - num_chunk_samples - 1)
+            stop = start + num_chunk_samples
+        else:
+            start = record['start']
+            stop = record['stop']
+
+        waveform = waveform[start:stop]
+
+        assert self.feat_type in feat_funcs.keys(), \
+            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
+        feat_func = feat_funcs[self.feat_type]
+        feat = feat_func(
+            waveform, sr=sr, **self.feat_config) if feat_func else waveform
+
+        record.update({'feat': feat})
+        if self.subset in ['train',
+                           'dev']:  # Labels are available in train and dev.
+            record.update({'label': self.spk_id2label[record['spk_id']]})
+
+        return record
+
+    @staticmethod
+    def _get_chunks(seg_dur, audio_id, audio_duration):
+        num_chunks = int(audio_duration / seg_dur)  # all in seconds
+
+        chunk_lst = [
+            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
+            for i in range(num_chunks)
+        ]
+        return chunk_lst
+
+    def _get_audio_info(self, wav_file: str,
+                        split_chunks: bool) -> List[List[str]]:
+        waveform, sr = load_audio(wav_file)
+        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
+        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
+        audio_duration = waveform.shape[0] / sr
+
+        ret = []
+        if split_chunks:  # Split into pieces of self.chunk_duration seconds.
+            uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
+                                                audio_duration)
+
+            for chunk in uniq_chunks_list:
+                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
+                start_sample = int(float(s) * sr)
+                end_sample = int(float(e) * sr)
+                # id, duration, wav, start, stop, spk_id
+                ret.append([
+                    chunk, audio_duration, wav_file, start_sample, end_sample,
+                    spk_id
+                ])
+        else:  # Keep whole audio.
+            ret.append([
+                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
+            ])
+        return ret
+
+    def generate_csv(self,
+                     wav_files: List[str],
+                     output_file: str,
+                     split_chunks: bool=True):
+        print(f'Generating csv: {output_file}')
+        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
+        # Note: this may raise a C++ exception, but the program will execute
+        # correctly, so we can ignore the exception.
+        with Pool(cpu_count()) as p:
+            infos = list(
+                tqdm(
+                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
+                           wav_files),
+                    total=len(wav_files)))
+
+        csv_lines = []
+        for info in infos:
+            csv_lines.extend(info)
+
+        with open(output_file, mode="w") as csv_f:
+            csv_writer = csv.writer(
+                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+            csv_writer.writerow(header)
+            for line in csv_lines:
+                csv_writer.writerow(line)
+
+    def prepare_data(self):
+        # Audio of speakers in veri_test_file should not be included in training set.
+        print("start to prepare the data csv file")
+        enroll_files = set()
+        test_files = set()
+        # Get the enroll and test audio file paths.
+        with open(self.veri_test_file, 'r') as f:
+            for line in f.readlines():
+                _, enrol_file, test_file = line.strip().split(' ')
+                enroll_files.add(os.path.join(self.wav_path, enrol_file))
+                test_files.add(os.path.join(self.wav_path, test_file))
+        enroll_files = sorted(enroll_files)
+        test_files = sorted(test_files)
+
+        # Get the enroll and test speakers.
+        test_spks = set()
+        for file in (enroll_files + test_files):
+            spk = file.split('/wav/')[1].split('/')[0]
+            test_spks.add(spk)
+
+        # Get all the train and dev audio file paths.
+        audio_files = []
+        speakers = set()
+        print("Getting file list...")
+        for path in [self.wav_path, self.vox2_base_path]:
+            # If the vox2 base path is not set or is not a directory,
+            # skip it instead of processing it.
+            if not path or not os.path.exists(path):
+                print(f"{path} is an invalid path, please check again, "
+                      "and we will ignore the vox2 base path")
+                continue
+            for file in glob.glob(
+                    os.path.join(path, "**", "*.wav"), recursive=True):
+                spk = file.split('/wav/')[1].split('/')[0]
+                if spk in test_spks:
+                    continue
+                speakers.add(spk)
+                audio_files.append(file)
+
+        print(
+            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
+        )
+        # Encode the train and dev speaker labels into spk_id2label.txt.
+        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
+            for label, spk_id in enumerate(
+                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
+                f.write(f'{spk_id} {label}\n')
+
+        audio_files = sorted(audio_files)
+        random.shuffle(audio_files)
+        split_idx = int(self.split_ratio * len(audio_files))
+        # split_ratio to train
+        train_files, dev_files = audio_files[:split_idx], audio_files[
+            split_idx:]
+
+        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
+        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
+
+        self.generate_csv(
+            enroll_files,
+            os.path.join(self.csv_path, 'enroll.csv'),
+            split_chunks=False)
+        self.generate_csv(
+            test_files,
+            os.path.join(self.csv_path, 'test.csv'),
+            split_chunks=False)
+
+    def __getitem__(self, idx):
+        return self._convert_to_record(idx)
+
+    def __len__(self):
+        return len(self._data)
diff --git a/audio/paddleaudio/features/__init__.py b/audio/paddleaudio/features/__init__.py
new file mode 100644
index 000000000..00781397f
--- /dev/null
+++ b/audio/paddleaudio/features/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .layers import LogMelSpectrogram
+from .layers import MelSpectrogram
+from .layers import MFCC
+from .layers import Spectrogram
diff --git a/audio/paddleaudio/features/layers.py b/audio/paddleaudio/features/layers.py
new file mode 100644
index 000000000..292363e64
--- /dev/null
+++ b/audio/paddleaudio/features/layers.py
@@ -0,0 +1,328 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+from typing import Optional
+from typing import Union
+
+import paddle
+import paddle.nn as nn
+from paddle import Tensor
+
+from ..functional import compute_fbank_matrix
+from ..functional import create_dct
+from ..functional import power_to_db
+from ..functional.window import get_window
+
+__all__ = [
+    'Spectrogram',
+    'MelSpectrogram',
+    'LogMelSpectrogram',
+    'MFCC',
+]
+
+
+class Spectrogram(nn.Layer):
+    """Compute spectrogram of given signals, typically audio waveforms.
+    The spectrogram is defined as the complex norm of the short-time Fourier transformation.
+
+    Args:
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
+    def __init__(self,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 dtype: str='float32') -> None:
+        super(Spectrogram, self).__init__()
+
+        assert power > 0, 'Power of spectrogram must be > 0.'
+ self.power = power + + if win_length is None: + win_length = n_fft + + self.fft_window = get_window( + window, win_length, fftbins=True, dtype=dtype) + self._stft = partial( + paddle.signal.stft, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=self.fft_window, + center=center, + pad_mode=pad_mode) + self.register_buffer('fft_window', self.fft_window) + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`. + """ + stft = self._stft(x) + spectrogram = paddle.pow(paddle.abs(stft), self.power) + return spectrogram + + +class MelSpectrogram(nn.Layer): + """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. 
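+
+    A minimal usage sketch (shapes and values are illustrative, not a
+    guaranteed API of this patch):
+
+        import paddle
+        x = paddle.randn([8, 22050])  # (N, T): a batch of 1-second clips
+        feature_extractor = MelSpectrogram(sr=22050, n_fft=512, n_mels=64)
+        feats = feature_extractor(x)  # (8, 64, num_frames)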
+ """ + + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + power: float=2.0, + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str='float32') -> None: + super(MelSpectrogram, self).__init__() + + self._spectrogram = Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + power=power, + center=center, + pad_mode=pad_mode, + dtype=dtype) + self.n_mels = n_mels + self.f_min = f_min + self.f_max = f_max + self.htk = htk + self.norm = norm + if f_max is None: + f_max = sr // 2 + self.fbank_matrix = compute_fbank_matrix( + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) # float64 for better numerical results + self.register_buffer('fbank_matrix', self.fbank_matrix) + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`. + """ + spect_feature = self._spectrogram(x) + mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) + return mel_feature + + +class LogMelSpectrogram(nn.Layer): + """Compute log-mel-spectrogram feature of given signals, typically audio waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. 
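+
+    A minimal usage sketch (values are illustrative only):
+
+        import paddle
+        x = paddle.randn([8, 22050])  # (N, T)
+        feature_extractor = LogMelSpectrogram(sr=22050, top_db=80.0)
+        log_feats = feature_extractor(x)  # (8, 64, num_frames), in dB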
+    """
+
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=50.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 ref_value: float=1.0,
+                 amin: float=1e-10,
+                 top_db: Optional[float]=None,
+                 dtype: str='float32') -> None:
+        super(LogMelSpectrogram, self).__init__()
+
+        self._melspectrogram = MelSpectrogram(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            power=power,
+            center=center,
+            pad_mode=pad_mode,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)
+
+        self.ref_value = ref_value
+        self.amin = amin
+        self.top_db = top_db
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
+        """
+        mel_feature = self._melspectrogram(x)
+        log_mel_feature = power_to_db(
+            mel_feature,
+            ref_value=self.ref_value,
+            amin=self.amin,
+            top_db=self.top_db)
+        return log_mel_feature
+
+
+class MFCC(nn.Layer):
+    """Compute mel frequency cepstral coefficients (MFCCs) feature of given waveforms.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_mfcc (int, optional): Number of mel frequency cepstral coefficients to keep. Defaults to 40.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
+        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
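+
+    A minimal usage sketch (values are illustrative only):
+
+        import paddle
+        x = paddle.randn([8, 22050])  # (N, T)
+        feature_extractor = MFCC(sr=22050, n_mfcc=40, n_mels=64)
+        mfccs = feature_extractor(x)  # (8, 40, num_frames)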
+    """
+
+    def __init__(self,
+                 sr: int=22050,
+                 n_mfcc: int=40,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 power: float=2.0,
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=50.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 ref_value: float=1.0,
+                 amin: float=1e-10,
+                 top_db: Optional[float]=None,
+                 dtype: str='float32') -> None:
+        super(MFCC, self).__init__()
+        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
+            n_mfcc, n_mels)
+        self._log_melspectrogram = LogMelSpectrogram(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            power=power,
+            center=center,
+            pad_mode=pad_mode,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            ref_value=ref_value,
+            amin=amin,
+            top_db=top_db,
+            dtype=dtype)
+        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
+        self.register_buffer('dct_matrix', self.dct_matrix)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
+        """
+        log_mel_feature = self._log_melspectrogram(x)
+        mfcc = paddle.matmul(
+            log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
+                (0, 2, 1))  # (N, n_mfcc, num_frames)
+        return mfcc
diff --git a/audio/paddleaudio/functional/__init__.py b/audio/paddleaudio/functional/__init__.py
new file mode 100644
index 000000000..c85232df1
--- /dev/null
+++ b/audio/paddleaudio/functional/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .functional import compute_fbank_matrix
+from .functional import create_dct
+from .functional import fft_frequencies
+from .functional import hz_to_mel
+from .functional import mel_frequencies
+from .functional import mel_to_hz
+from .functional import power_to_db
diff --git a/audio/paddleaudio/functional/functional.py b/audio/paddleaudio/functional/functional.py
new file mode 100644
index 000000000..19c63a9ae
--- /dev/null
+++ b/audio/paddleaudio/functional/functional.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from librosa(https://github.com/librosa/librosa) +import math +from typing import Optional +from typing import Union + +import paddle +from paddle import Tensor + +__all__ = [ + 'hz_to_mel', + 'mel_to_hz', + 'mel_frequencies', + 'fft_frequencies', + 'compute_fbank_matrix', + 'power_to_db', + 'create_dct', +] + + +def hz_to_mel(freq: Union[Tensor, float], + htk: bool=False) -> Union[Tensor, float]: + """Convert Hz to Mels. + + Args: + freq (Union[Tensor, float]): The input tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + Union[Tensor, float]: Frequency in mels. + """ + + if htk: + if isinstance(freq, Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, Tensor], + htk: bool=False) -> Union[float, Tensor]: + """Convert mel bin numbers to frequencies. + + Args: + mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + Union[float, Tensor]: Frequencies in Hz. + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str='float32') -> Tensor: + """Compute mel frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + + Returns: + Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: + """Compute fourier frequencies. + + Args: + sr (int): Sample rate. 
+ n_fft (int): Number of fft bins. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + + Returns: + Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str='float32') -> Tensor: + """Compute fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + + Returns: + Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. + """ + + if f_max is None: + f_max = float(sr) / 2 + + # Initialize the weights + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies( + n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) + + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = paddle.maximum( + paddle.zeros_like(lower), paddle.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) + + return weights + + +def power_to_db(spect: Tensor, + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None) -> Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (Tensor): STFT power spectrogram. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. + + Returns: + Tensor: Power spectrogram in db scale. 
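+
+    A minimal usage sketch (shapes are illustrative only):
+
+        import paddle
+        spect = paddle.rand([1, 257, 100])  # e.g. a power spectrogram
+        db = power_to_db(spect, ref_value=1.0, top_db=80.0)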
+ """ + if amin <= 0: + raise Exception("amin must be strictly positive") + + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") + + ones = paddle.ones_like(spect) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) + + if top_db is not None: + if top_db < 0: + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) + + return log_spec + + +def create_dct(n_mfcc: int, + n_mels: int, + norm: Optional[str]='ortho', + dtype: str='float32') -> Tensor: + """Create a discrete cosine transform(DCT) matrix. + + Args: + n_mfcc (int): Number of mel frequency cepstral coefficients. + n_mels (int): Number of mel filterbanks. + norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + + Returns: + Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. + """ + n = paddle.arange(n_mels, dtype=dtype) + k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) + dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * + k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 + else: + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.T diff --git a/audio/paddleaudio/functional/window.py b/audio/paddleaudio/functional/window.py new file mode 100644 index 000000000..c99d50462 --- /dev/null +++ b/audio/paddleaudio/functional/window.py @@ -0,0 +1,337 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +import math +from typing import List +from typing import Tuple +from typing import Union + +import paddle +from paddle import Tensor + +__all__ = [ + 'get_window', +] + + +def _cat(x: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_, data_type) for _ in x] + return paddle.concat(l) + + +def _acosh(x: Union[Tensor, float]) -> Tensor: + if isinstance(x, float): + return math.log(x + math.sqrt(x**2 - 1)) + return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) + + +def _extend(M: int, sym: bool) -> bool: + """Extend window by 1 sample if needed for DFT-even symmetry. """ + if not sym: + return M + 1, True + else: + return M, False + + +def _len_guards(M: int) -> bool: + """Handle small or incorrect window lengths. """ + if int(M) != M or M < 0: + raise ValueError('Window length M must be a non-negative integer') + + return M <= 1 + + +def _truncate(w: Tensor, needed: bool) -> Tensor: + """Truncate window by 1 sample if needed for DFT-even symmetry. """ + if needed: + return w[:-1] + else: + return w + + +def _general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a window with a generalized Gaussian shape. + This function is consistent with scipy.signal.windows.general_gaussian(). 
+ """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) + + return _truncate(w, needs_trunc) + + +def _general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + +def _general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generalized Hamming window. + This function is consistent with scipy.signal.windows.general_hamming() + """ + return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) + + +def _taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Taylor window. + The Taylor window taper function approximates the Dolph-Chebyshev window's + constant sidelobe level for a parameterized number of near-in sidelobes. + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + # Original text uses a negative sidelobe level parameter and then negates + # it in the calculation of B. To keep consistent with other methods we + # assume the sidelobe level parameter to be positive. + B = 10**(sll / 20) + A = _acosh(B) / math.pi + s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) + ma = paddle.arange(1, nbar, dtype=dtype) + + Fm = paddle.empty((nbar - 1, ), dtype=dtype) + signs = paddle.empty_like(ma) + signs[::2] = 1 + signs[1::2] = -1 + m2 = ma * ma + for mi in range(len(ma)): + numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 + )) + if mi == 0: + denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) + elif mi == len(ma) - 1: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) + else: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ + mi] / m2[mi + 1:]) + + Fm[mi] = numer / denom + + def W(n): + return 1 + 2 * paddle.matmul( + Fm.unsqueeze(0), + paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) + + w = W(paddle.arange(0, M, dtype=dtype)) + + # normalize (Note that this is not described in the original text [1]) + if norm: + scale = 1.0 / W((M - 1) / 2) + w *= scale + w = w.squeeze() + return _truncate(w, needs_trunc) + + +def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hamming window. + The Hamming window is a taper formed by using a raised cosine with + non-zero endpoints, optimized to minimize the nearest side lobe. + """ + return _general_hamming(M, 0.54, sym, dtype=dtype) + + +def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hann window. + The Hann window is a taper formed by using a raised cosine or sine-squared + with ends that touch zero. + """ + return _general_hamming(M, 0.5, sym, dtype=dtype) + + +def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Tukey window. + The Tukey window is also known as a tapered cosine window. 
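+    The alpha parameter gives the fraction of the window inside the cosine
+    tapered region (a note borrowed from the scipy.signal.windows.tukey
+    documentation that this implementation mirrors): alpha = 0 reduces to a
+    rectangular window and alpha = 1 to a Hann window.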
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+
+    if alpha <= 0:
+        return paddle.ones((M, ), dtype=dtype)
+    elif alpha >= 1.0:
+        return _hann(M, sym=sym, dtype=dtype)
+
+    M, needs_trunc = _extend(M, sym)
+
+    n = paddle.arange(0, M, dtype=dtype)
+    width = int(alpha * (M - 1) / 2.0)
+    n1 = n[0:width + 1]
+    n2 = n[width + 1:M - width - 1]
+    n3 = n[M - width - 1:]
+
+    w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
+    w2 = paddle.ones(n2.shape, dtype=dtype)
+    w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
+                                          (M - 1))))
+    w = paddle.concat([w1, w2, w3])
+
+    return _truncate(w, needs_trunc)
+
+
+def _kaiser(M: int, beta: float, sym: bool=True,
+            dtype: str='float64') -> Tensor:
+    """Compute a Kaiser window.
+    The Kaiser window is a taper formed by using a Bessel function.
+    """
+    raise NotImplementedError()
+
+
+def _gaussian(M: int, std: float, sym: bool=True,
+              dtype: str='float64') -> Tensor:
+    """Compute a Gaussian window.
+    The Gaussian window has a Gaussian shape defined by the standard deviation (std).
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
+    sig2 = 2 * std * std
+    w = paddle.exp(-n**2 / sig2)
+
+    return _truncate(w, needs_trunc)
+
+
+def _exponential(M: int,
+                 center=None,
+                 tau=1.,
+                 sym: bool=True,
+                 dtype: str='float64') -> Tensor:
+    """Compute an exponential (or Poisson) window."""
+    if sym and center is not None:
+        raise ValueError("If sym==True, center must be None.")
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    if center is None:
+        center = (M - 1) / 2
+
+    n = paddle.arange(0, M, dtype=dtype)
+    w = paddle.exp(-paddle.abs(n - center) / tau)
+
+    return _truncate(w, needs_trunc)
+
+
+def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a triangular window.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
+    if M % 2 == 0:
+        w = (2 * n - 1.0) / M
+        w = paddle.concat([w, w[::-1]])
+    else:
+        w = 2 * n / (M + 1.0)
+        w = paddle.concat([w, w[-2::-1]])
+
+    return _truncate(w, needs_trunc)
+
+
+def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Bohman window.
+    The Bohman window is the autocorrelation of a cosine window.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
+    w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
+        math.pi * fac)
+    w = _cat([0, w, 0], dtype)
+
+    return _truncate(w, needs_trunc)
+
+
+def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Blackman window.
+    The Blackman window is a taper formed by using the first three terms of
+    a summation of cosines. It was designed to have close to the minimal
+    leakage possible. It is close to optimal, only slightly worse than a
+    Kaiser window.
+    """
+    return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
+
+
+def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a window with a simple cosine shape.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
+
+    return _truncate(w, needs_trunc)
+
+
+def get_window(window: Union[str, Tuple[str, float]],
+               win_length: int,
+               fftbins: bool=True,
+               dtype: str='float64') -> Tensor:
+    """Return a window of a given length and type.
+
+    Args:
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+        win_length (int): Number of samples.
+        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
+        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
+
+    Returns:
+        Tensor: The window represented as a tensor.
+    """
+    sym = not fftbins
+
+    args = ()
+    if isinstance(window, tuple):
+        winstr = window[0]
+        if len(window) > 1:
+            args = window[1:]
+    elif isinstance(window, str):
+        if window in ['gaussian', 'exponential']:
+            raise ValueError("The '" + window + "' window needs one or "
+                             "more parameters -- pass a tuple.")
+        else:
+            winstr = window
+    else:
+        raise ValueError("%s as window type is not supported." %
+                         str(type(window)))
+
+    try:
+        winfunc = eval('_' + winstr)
+    except NameError as e:
+        raise ValueError("Unknown window type.") from e
+
+    params = (win_length, ) + args
+    kwargs = {'sym': sym}
+    return winfunc(*params, dtype=dtype, **kwargs)
diff --git a/audio/paddleaudio/io/__init__.py b/audio/paddleaudio/io/__init__.py
new file mode 100644
index 000000000..185a92b8d
--- /dev/null
+++ b/audio/paddleaudio/io/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/audio/paddleaudio/metric/__init__.py b/audio/paddleaudio/metric/__init__.py
new file mode 100644
index 000000000..7ce6f5cff
--- /dev/null
+++ b/audio/paddleaudio/metric/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .eer import compute_eer
+from .eer import compute_minDCF
diff --git a/audio/paddleaudio/metric/eer.py b/audio/paddleaudio/metric/eer.py
new file mode 100644
index 000000000..a1166d3f9
--- /dev/null
+++ b/audio/paddleaudio/metric/eer.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+from sklearn.metrics import roc_curve
+
+
+def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
+    """Compute EER and return the score threshold.
+
+    Args:
+        labels (np.ndarray): the trial labels, shape: [N], one-dimensional, where N is the number of trials
+        scores (np.ndarray): the trial scores, shape: [N], one-dimensional, where N is the number of trials
+
+    Returns:
+        List[float]: eer and the specific threshold
+    """
+    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
+    fnr = 1 - tpr
+    eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
+    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
+    return eer, eer_threshold
+
+
+def compute_minDCF(positive_scores,
+                   negative_scores,
+                   c_miss=1.0,
+                   c_fa=1.0,
+                   p_target=0.01):
+    """
+    This is modified from SpeechBrain
+    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
+    Computes the minDCF metric normally used to evaluate speaker verification
+    systems. The min_DCF is the minimum of the following C_det function computed
+    within the defined threshold range:
+
+    C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
+
+    where p_miss is the miss probability and p_fa is the probability of having
+    a false alarm.
+
+    Args:
+        positive_scores (paddle.Tensor): The scores from entries of the same class.
+        negative_scores (paddle.Tensor): The scores from entries of different classes.
+        c_miss (float, optional): Cost assigned to a miss error (default 1.0).
+        c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
+        p_target (float, optional): Prior probability of having a target (default 0.01).
+
+    Returns:
+        List[float]: The minDCF and the score threshold at which it is reached.
+    """
+    # Computing candidate thresholds
+    if len(positive_scores.shape) > 1:
+        positive_scores = positive_scores.squeeze()
+
+    if len(negative_scores.shape) > 1:
+        negative_scores = negative_scores.squeeze()
+
+    thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores]))
+    thresholds = paddle.unique(thresholds)
+
+    # Adding intermediate thresholds
+    interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2
+    thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds]))
+
+    # Computing False Rejection Rate (miss detection)
+    positive_scores = paddle.concat(
+        len(thresholds) * [positive_scores.unsqueeze(0)])
+    pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds
+    p_miss = (pos_scores_threshold.sum(0)
+              ).astype("float32") / positive_scores.shape[1]
+    del positive_scores
+    del pos_scores_threshold
+
+    # Computing False Acceptance Rate (false alarm)
+    negative_scores = paddle.concat(
+        len(thresholds) * [negative_scores.unsqueeze(0)])
+    neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds
+    p_fa = (neg_scores_threshold.sum(0)
+            ).astype("float32") / negative_scores.shape[1]
+    del negative_scores
+    del neg_scores_threshold
+
+    c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
+    c_min = paddle.min(c_det, axis=0)
+    min_index = paddle.argmin(c_det, axis=0)
+    return float(c_min), float(thresholds[min_index])
diff --git a/audio/paddleaudio/sox_effects/__init__.py b/audio/paddleaudio/sox_effects/__init__.py
new file mode 100644
index 000000000..97043fd7b
--- /dev/null
+++ b/audio/paddleaudio/sox_effects/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/audio/paddleaudio/utils/__init__.py b/audio/paddleaudio/utils/__init__.py
new file mode 100644
index 000000000..b10731d46
--- /dev/null
+++ b/audio/paddleaudio/utils/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
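A quick smoke test of the two metrics above: `compute_eer` takes plain NumPy arrays of trial labels and scores, while `compute_minDCF` expects 1-D paddle tensors of same-class and different-class scores (a minimal sketch with made-up scores):

```python
import numpy as np
import paddle
from paddleaudio.metric import compute_eer, compute_minDCF

labels = np.array([1, 1, 1, 0, 0])            # 1: target trial, 0: impostor trial
scores = np.array([0.9, 0.8, 0.6, 0.3, 0.1])  # higher score = more likely a target
eer, eer_threshold = compute_eer(labels, scores)

min_dcf, dcf_threshold = compute_minDCF(
    paddle.to_tensor([0.9, 0.8, 0.6]),  # scores for same-speaker trials
    paddle.to_tensor([0.3, 0.1]))       # scores for different-speaker trials
```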
+from .download import decompress
+from .download import download_and_decompress
+from .download import load_state_dict_from_url
+from .env import DATA_HOME
+from .env import MODEL_HOME
+from .env import PPAUDIO_HOME
+from .env import USER_HOME
+from .error import ParameterError
+from .log import Logger
+from .log import logger
+from .time import seconds_to_hms
+from .time import Timer
+from .numeric import depth_convert
+from .numeric import pcm16to32
diff --git a/audio/paddleaudio/utils/download.py b/audio/paddleaudio/utils/download.py
new file mode 100644
index 000000000..07d5eea84
--- /dev/null
+++ b/audio/paddleaudio/utils/download.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict
+from typing import List
+
+from paddle.framework import load as load_state_dict
+from paddle.utils import download
+
+from .log import logger
+
+download.logger = logger
+
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
+
+
+def decompress(file: str):
+    """
+    Extract all files from a compressed archive.
+    """
+    assert os.path.isfile(file), "File: {} does not exist.".format(file)
+    download._decompress(file)
+
+
+def download_and_decompress(archives: List[Dict[str, str]],
+                            path: str,
+                            decompress: bool=True):
+    """
+    Download archives and decompress them to the given path.
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
+        download.get_path_from_url(
+            archive['url'], path, archive['md5'], decompress=decompress)
+
+
+def load_state_dict_from_url(url: str, path: str, md5: str=None):
+    """
+    Download a checkpoint from a URL and load it as a state dict.
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    download.get_path_from_url(url, path, md5)
+    return load_state_dict(os.path.join(path, os.path.basename(url)))
diff --git a/audio/paddleaudio/utils/env.py b/audio/paddleaudio/utils/env.py
new file mode 100644
index 000000000..a2d14b89e
--- /dev/null
+++ b/audio/paddleaudio/utils/env.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+This module is used to store environmental variables in PaddleAudio.
+PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Defaults to ~/.paddleaudio. Users can change the
+                 default value through the PPAUDIO_HOME environment variable.
+├─ MODEL_HOME --> Store model files.
+└─ DATA_HOME --> Store automatically downloaded datasets.
+'''
+import os
+
+__all__ = [
+    'USER_HOME',
+    'PPAUDIO_HOME',
+    'MODEL_HOME',
+    'DATA_HOME',
+]
+
+
+def _get_user_home():
+    return os.path.expanduser('~')
+
+
+def _get_ppaudio_home():
+    if 'PPAUDIO_HOME' in os.environ:
+        home_path = os.environ['PPAUDIO_HOME']
+        if os.path.exists(home_path):
+            if os.path.isdir(home_path):
+                return home_path
+            else:
+                raise RuntimeError(
+                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
+                    format(home_path))
+        else:
+            return home_path
+    return os.path.join(_get_user_home(), '.paddleaudio')
+
+
+def _get_sub_home(directory):
+    home = os.path.join(_get_ppaudio_home(), directory)
+    if not os.path.exists(home):
+        os.makedirs(home)
+    return home
+
+
+USER_HOME = _get_user_home()
+PPAUDIO_HOME = _get_ppaudio_home()
+MODEL_HOME = _get_sub_home('models')
+DATA_HOME = _get_sub_home('datasets')
diff --git a/audio/paddleaudio/utils/error.py b/audio/paddleaudio/utils/error.py
new file mode 100644
index 000000000..f39774892
--- /dev/null
+++ b/audio/paddleaudio/utils/error.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['ParameterError']
+
+
+class ParameterError(Exception):
+    """Exception class for parameter checking"""
+    pass
diff --git a/audio/paddleaudio/utils/log.py b/audio/paddleaudio/utils/log.py
new file mode 100644
index 000000000..5656b286a
--- /dev/null
+++ b/audio/paddleaudio/utils/log.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
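Because the cache root is resolved once at import time, `PPAUDIO_HOME` has to be set before `paddleaudio` is imported. A short sketch (the cache path is hypothetical):

```python
import os

os.environ['PPAUDIO_HOME'] = '/mnt/cache/ppaudio'  # must precede the import

from paddleaudio.utils import DATA_HOME, MODEL_HOME

print(MODEL_HOME)  # /mnt/cache/ppaudio/models
print(DATA_HOME)   # /mnt/cache/ppaudio/datasets
```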
+import contextlib
+import functools
+import logging
+import threading
+import time
+
+import colorlog
+
+__all__ = [
+    'Logger',
+    'logger',
+]
+
+log_config = {
+    'DEBUG': {
+        'level': 10,
+        'color': 'purple'
+    },
+    'INFO': {
+        'level': 20,
+        'color': 'green'
+    },
+    'TRAIN': {
+        'level': 21,
+        'color': 'cyan'
+    },
+    'EVAL': {
+        'level': 22,
+        'color': 'blue'
+    },
+    'WARNING': {
+        'level': 30,
+        'color': 'yellow'
+    },
+    'ERROR': {
+        'level': 40,
+        'color': 'red'
+    },
+    'CRITICAL': {
+        'level': 50,
+        'color': 'bold_red'
+    }
+}
+
+
+class Logger(object):
+    '''
+    Default logger in PaddleAudio.
+    Args:
+        name (str): Logger name. Defaults to 'PaddleAudio'.
+    '''
+
+    def __init__(self, name: str=None):
+        name = 'PaddleAudio' if not name else name
+        self.logger = logging.getLogger(name)
+
+        for key, conf in log_config.items():
+            logging.addLevelName(conf['level'], key)
+            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
+            self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                           conf['level'])
+
+        self.format = colorlog.ColoredFormatter(
+            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
+            log_colors={key: conf['color']
+                        for key, conf in log_config.items()})
+
+        self.handler = logging.StreamHandler()
+        self.handler.setFormatter(self.format)
+
+        self.logger.addHandler(self.handler)
+        self.logLevel = 'DEBUG'
+        self.logger.setLevel(logging.DEBUG)
+        self.logger.propagate = False
+        self._is_enable = True
+
+    def disable(self):
+        self._is_enable = False
+
+    def enable(self):
+        self._is_enable = True
+
+    @property
+    def is_enable(self) -> bool:
+        return self._is_enable
+
+    def __call__(self, log_level: str, msg: str):
+        if not self.is_enable:
+            return
+
+        self.logger.log(log_level, msg)
+
+    @contextlib.contextmanager
+    def use_terminator(self, terminator: str):
+        old_terminator = self.handler.terminator
+        self.handler.terminator = terminator
+        yield
+        self.handler.terminator = old_terminator
+
+    @contextlib.contextmanager
+    def processing(self, msg: str, interval: float=0.1):
+        '''
+        Continuously print a rotating progress indicator until the wrapped block finishes.
+        Args:
+            msg (str): Message to be printed.
+            interval (float): Rotation interval. Defaults to 0.1.
+        '''
+        end = False
+
+        def _printer():
+            index = 0
+            flags = ['\\', '|', '/', '-']
+            while not end:
+                flag = flags[index % len(flags)]
+                with self.use_terminator('\r'):
+                    self.info('{}: {}'.format(msg, flag))
+                time.sleep(interval)
+                index += 1
+
+        t = threading.Thread(target=_printer)
+        t.start()
+        yield
+        end = True
+
+
+logger = Logger()
diff --git a/audio/paddleaudio/utils/numeric.py b/audio/paddleaudio/utils/numeric.py
new file mode 100644
index 000000000..9fe004846
--- /dev/null
+++ b/audio/paddleaudio/utils/numeric.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
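Each entry of `log_config` is attached to the logger as both an upper- and a lower-case method, so the custom `TRAIN` and `EVAL` levels are used exactly like the built-in ones, and `processing` wraps a slow block with the spinner (a minimal sketch):

```python
import time

from paddleaudio.utils import logger

logger.info('plain INFO message')
logger.train('step 100, loss 0.42')  # custom TRAIN level (21)
logger.eval('dev acc 0.97')          # custom EVAL level (22)

with logger.processing('Extracting features'):
    time.sleep(1)                    # stand-in for real work
```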
+from typing import Union
+
+import numpy as np
+
+from .error import ParameterError
+
+__all__ = ["pcm16to32", "depth_convert"]
+
+# Small epsilon (chosen here as 1e-8) that keeps integer rescaling factors
+# strictly below the target dtype maximum.
+EPS = 1e-8
+
+
+def pcm16to32(audio: np.ndarray) -> np.ndarray:
+    """Convert PCM int16 audio to float32.
+
+    Args:
+        audio (np.ndarray): Waveform with dtype of int16.
+
+    Returns:
+        np.ndarray: Waveform with dtype of float32.
+    """
+    if audio.dtype == np.int16:
+        audio = audio.astype("float32")
+        bits = np.iinfo(np.int16).bits
+        audio = audio / (2**(bits - 1))
+    return audio
+
+
+def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Data type casting in a safe way, i.e., prevent overflow or underflow.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+    """
+    if 'float' in str(y.dtype):
+        return np.clip(y, np.finfo(dtype).min,
+                       np.finfo(dtype).max).astype(dtype)
+    else:
+        return np.clip(y, np.iinfo(dtype).min,
+                       np.iinfo(dtype).max).astype(dtype)
+
+
+def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Convert audio array to target dtype safely.
+    This function converts an audio waveform to a target dtype, with additional
+    steps to prevent overflow/underflow and to preserve the audio range.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
+    """
+
+    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
+    if y.dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype == y.dtype:
+        return y
+
+    if dtype == 'float64' and y.dtype == 'float32':
+        return _safe_cast(y, dtype)
+    if dtype == 'float32' and y.dtype == 'float64':
+        return _safe_cast(y, dtype)
+
+    if dtype == 'int16' or dtype == 'int8':
+        if y.dtype in ['float64', 'float32']:
+            factor = np.iinfo(dtype).max
+            y = np.clip(y * factor, np.iinfo(dtype).min,
+                        np.iinfo(dtype).max).astype(dtype)
+        else:
+            if dtype == 'int16' and y.dtype == 'int8':
+                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
+                y = y.astype('float32') * factor
+                y = y.astype('int16')
+
+            else:  # dtype == 'int8' and y.dtype == 'int16'
+                y = y.astype('int32') * np.iinfo('int8').max / \
+                    np.iinfo('int16').max
+                y = y.astype('int8')
+
+    if dtype in ['float32', 'float64']:
+        org_dtype = y.dtype
+        y = y.astype(dtype) / np.iinfo(org_dtype).max
+    return y
diff --git a/audio/paddleaudio/utils/time.py b/audio/paddleaudio/utils/time.py
new file mode 100644
index 000000000..105208f91
--- /dev/null
+++ b/audio/paddleaudio/utils/time.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
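A round-trip sketch of the converters above, going from int16 PCM to float32 and back (values are illustrative; the int16 result is clipped and truncated, so the round trip is not bit-exact):

```python
import numpy as np

from paddleaudio.utils import depth_convert, pcm16to32

pcm = np.array([0, 8192, -32768], dtype=np.int16)
f32 = pcm16to32(pcm)               # float32 samples in [-1.0, 1.0)
i16 = depth_convert(f32, 'int16')  # rescaled and clipped back to int16
```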
+import math
+import time
+
+__all__ = [
+    'Timer',
+    'seconds_to_hms',
+]
+
+
+class Timer(object):
+    '''Calculate running speed and estimated time of arrival (ETA).'''
+
+    def __init__(self, total_step: int):
+        self.total_step = total_step
+        self.last_start_step = 0
+        self.current_step = 0
+        self._is_running = True
+
+    def start(self):
+        self.last_time = time.time()
+        self.start_time = time.time()
+
+    def stop(self):
+        self._is_running = False
+        self.end_time = time.time()
+
+    def count(self) -> int:
+        if not self.current_step >= self.total_step:
+            self.current_step += 1
+        return self.current_step
+
+    @property
+    def timing(self) -> float:
+        run_steps = self.current_step - self.last_start_step
+        self.last_start_step = self.current_step
+        time_used = time.time() - self.last_time
+        self.last_time = time.time()
+        return run_steps / time_used
+
+    @property
+    def is_running(self) -> bool:
+        return self._is_running
+
+    @property
+    def eta(self) -> str:
+        if not self.is_running:
+            return '00:00:00'
+        scale = (self.total_step - self.current_step) / self.current_step
+        remaining_time = (time.time() - self.start_time) * scale
+        return seconds_to_hms(remaining_time)
+
+
+def seconds_to_hms(seconds: int) -> str:
+    '''Convert a number of seconds to an hh:mm:ss string.'''
+    h = math.floor(seconds / 3600)
+    m = math.floor((seconds - h * 3600) / 60)
+    s = int(seconds - h * 3600 - m * 60)
+    hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
+    return hms_str
diff --git a/audio/setup.py b/audio/setup.py
new file mode 100644
index 000000000..3f64b52f2
--- /dev/null
+++ b/audio/setup.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
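`count()` is meant to be called once per step; `timing` then reports the step rate since it was last read and `eta` the projected time remaining (a minimal training-loop sketch):

```python
import time

from paddleaudio.utils import Timer

timer = Timer(total_step=100)
timer.start()
for step in range(100):
    time.sleep(0.01)  # stand-in for one training step
    timer.count()
    print(f'{timer.timing:.1f} steps/s, ETA {timer.eta}')
timer.stop()
```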
+import glob +import os + +import setuptools +from setuptools.command.install import install +from setuptools.command.test import test + +# set the version here +VERSION = '1.0.2' + + +# Inspired by the example at https://pytest.org/latest/goodpractises.html +class TestCommand(test): + def finalize_options(self): + test.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run(self): + self.run_benchmark() + super(TestCommand, self).run() + + def run_tests(self): + # Run nose ensuring that argv simulates running nosetests directly + import nose + nose.run_exit(argv=['nosetests', '-w', 'tests']) + + def run_benchmark(self): + for benchmark_item in glob.glob('tests/benchmark/*py'): + os.system(f'pytest {benchmark_item}') + + +class InstallCommand(install): + def run(self): + install.run(self) + + +def write_version_py(filename='paddleaudio/__init__.py'): + with open(filename, "a") as f: + f.write(f"__version__ = '{VERSION}'") + + +def remove_version_py(filename='paddleaudio/__init__.py'): + with open(filename, "r") as f: + lines = f.readlines() + with open(filename, "w") as f: + for line in lines: + if "__version__" not in line: + f.write(line) + + +remove_version_py() +write_version_py() + +setuptools.setup( + name="paddleaudio", + version=VERSION, + author="", + author_email="", + description="PaddleAudio, in development", + long_description="", + long_description_content_type="text/markdown", + url="", + packages=setuptools.find_packages(include=['paddleaudio*']), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', + install_requires=[ + 'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2', + 'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8' + ], + extras_require={ + 'test': [ + 'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1', + 'torchaudio==0.10.2', 'pytest-benchmark' + ], + }, + cmdclass={ + 'install': InstallCommand, + 'test': TestCommand, + }, ) + +remove_version_py() diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py index b3421e322..57e06e521 100644 --- a/paddlespeech/audio/backends/soundfile_backend.py +++ b/paddlespeech/audio/backends/soundfile_backend.py @@ -32,9 +32,9 @@ __all__ = [ 'to_mono', 'normalize', 'save', - 'soudfile_save', + 'soundfile_save', 'load', - 'load_old', + 'soundfile_load', 'info', 'to_mono' ] @@ -659,4 +659,4 @@ def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: sinfo.channels, bits_per_sample=_get_bit_depth(sinfo.subtype), encoding=_get_encoding(sinfo.format, sinfo.subtype), - ) \ No newline at end of file + ) From 7261d86344fb256edd3def1ce3b620afbb03f745 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Thu, 15 Sep 2022 21:12:57 +0800 Subject: [PATCH 2/2] add test & benchmark --- audio/tests/backends/__init__.py | 13 + audio/tests/backends/common.py | 32 ++ audio/tests/backends/soundfile/__init__.py | 13 + audio/tests/backends/soundfile/base.py | 34 ++ audio/tests/backends/soundfile/common.py | 57 +++ audio/tests/backends/soundfile/info_test.py | 199 ++++++++++ audio/tests/backends/soundfile/load_test.py | 369 ++++++++++++++++++ audio/tests/backends/soundfile/save_test.py | 322 +++++++++++++++ audio/tests/backends/soundfile/test_io.py | 74 ++++ audio/tests/benchmark/README.md | 39 ++ audio/tests/benchmark/log_melspectrogram.py | 123 ++++++ audio/tests/benchmark/melspectrogram.py | 107 +++++ audio/tests/benchmark/mfcc.py | 121 
++++++ audio/tests/common_utils/__init__.py | 17 + audio/tests/common_utils/case_utils.py | 56 +++ .../tests/common_utils/parameterized_utils.py | 43 ++ audio/tests/common_utils/wav_utils.py | 102 +++++ audio/tests/features/__init__.py | 13 + audio/tests/features/base.py | 48 +++ audio/tests/features/test_kaldi.py | 81 ++++ audio/tests/features/test_librosa.py | 281 +++++++++++++ 21 files changed, 2144 insertions(+) create mode 100644 audio/tests/backends/__init__.py create mode 100644 audio/tests/backends/common.py create mode 100644 audio/tests/backends/soundfile/__init__.py create mode 100644 audio/tests/backends/soundfile/base.py create mode 100644 audio/tests/backends/soundfile/common.py create mode 100644 audio/tests/backends/soundfile/info_test.py create mode 100644 audio/tests/backends/soundfile/load_test.py create mode 100644 audio/tests/backends/soundfile/save_test.py create mode 100644 audio/tests/backends/soundfile/test_io.py create mode 100644 audio/tests/benchmark/README.md create mode 100644 audio/tests/benchmark/log_melspectrogram.py create mode 100644 audio/tests/benchmark/melspectrogram.py create mode 100644 audio/tests/benchmark/mfcc.py create mode 100644 audio/tests/common_utils/__init__.py create mode 100644 audio/tests/common_utils/case_utils.py create mode 100644 audio/tests/common_utils/parameterized_utils.py create mode 100644 audio/tests/common_utils/wav_utils.py create mode 100644 audio/tests/features/__init__.py create mode 100644 audio/tests/features/base.py create mode 100644 audio/tests/features/test_kaldi.py create mode 100644 audio/tests/features/test_librosa.py diff --git a/audio/tests/backends/__init__.py b/audio/tests/backends/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/audio/tests/backends/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/audio/tests/backends/common.py b/audio/tests/backends/common.py new file mode 100644 index 000000000..79b922a91 --- /dev/null +++ b/audio/tests/backends/common.py @@ -0,0 +1,32 @@ + +def get_encoding(ext, dtype): + exts = { + "mp3", + "flac", + "vorbis", + } + encodings = { + "float32": "PCM_F", + "int32": "PCM_S", + "int16": "PCM_S", + "uint8": "PCM_U", + } + return ext.upper() if ext in exts else encodings[dtype] + + +def get_bit_depth(dtype): + bit_depths = { + "float32": 32, + "int32": 32, + "int16": 16, + "uint8": 8, + } + return bit_depths[dtype] + +def get_bits_per_sample(ext, dtype): + bits_per_samples = { + "flac": 24, + "mp3": 0, + "vorbis": 0, + } + return bits_per_samples.get(ext, get_bit_depth(dtype)) diff --git a/audio/tests/backends/soundfile/__init__.py b/audio/tests/backends/soundfile/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/audio/tests/backends/soundfile/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/audio/tests/backends/soundfile/base.py b/audio/tests/backends/soundfile/base.py new file mode 100644 index 000000000..a67191887 --- /dev/null +++ b/audio/tests/backends/soundfile/base.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import urllib.request + +mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav' + + +class BackendTest(unittest.TestCase): + def setUp(self): + self.initWavInput() + + def initWavInput(self): + self.files = [] + for url in [mono_channel_wav, multi_channels_wav]: + if not os.path.isfile(os.path.basename(url)): + urllib.request.urlretrieve(url, os.path.basename(url)) + self.files.append(os.path.basename(url)) + + def initParmas(self): + raise NotImplementedError diff --git a/audio/tests/backends/soundfile/common.py b/audio/tests/backends/soundfile/common.py new file mode 100644 index 000000000..42a07e1f0 --- /dev/null +++ b/audio/tests/backends/soundfile/common.py @@ -0,0 +1,57 @@ +import itertools +from unittest import skipIf + +from parameterized import parameterized +from paddleaudio._internal.module_utils import is_module_available + + +def name_func(func, _, params): + return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' + + +def dtype2subtype(dtype): + return { + "float64": "DOUBLE", + "float32": "FLOAT", + "int32": "PCM_32", + "int16": "PCM_16", + "uint8": "PCM_U8", + "int8": "PCM_S8", + }[dtype] + + +def skipIfFormatNotSupported(fmt): + fmts = [] + if is_module_available("soundfile"): + import soundfile + + fmts = soundfile.available_formats() + return skipIf(fmt not in fmts, f'"{fmt}" is not supported by soundfile') + return skipIf(True, '"soundfile" not available.') + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params)), name_func=name_func) + + +def fetch_wav_subtype(dtype, encoding, bits_per_sample): + subtype = { + (None, None): dtype2subtype(dtype), + (None, 8): "PCM_U8", + ("PCM_U", None): "PCM_U8", + ("PCM_U", 8): "PCM_U8", + ("PCM_S", None): "PCM_32", + ("PCM_S", 16): "PCM_16", + ("PCM_S", 32): "PCM_32", + ("PCM_F", None): "FLOAT", + ("PCM_F", 32): "FLOAT", + ("PCM_F", 64): "DOUBLE", + ("ULAW", None): "ULAW", + ("ULAW", 8): "ULAW", + ("ALAW", None): "ALAW", + ("ALAW", 8): "ALAW", + }.get((encoding, bits_per_sample)) + if subtype: 
+ return subtype + raise ValueError(f"wav does not support ({encoding}, {bits_per_sample}).") + diff --git a/audio/tests/backends/soundfile/info_test.py b/audio/tests/backends/soundfile/info_test.py new file mode 100644 index 000000000..94f167ed9 --- /dev/null +++ b/audio/tests/backends/soundfile/info_test.py @@ -0,0 +1,199 @@ +#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py + +import tarfile +import warnings +import unittest +from unittest.mock import patch + +import paddle +from paddleaudio._internal import module_utils as _mod_utils +from paddleaudio.backends import soundfile_backend +from tests.backends.common import get_bits_per_sample, get_encoding +from tests.common_utils import ( + get_wav_data, + nested_params, + save_wav, + TempDirMixin, +) + +from common import parameterize, skipIfFormatNotSupported + +import soundfile + + +class TestInfo(TempDirMixin, unittest.TestCase): + @parameterize( + ["float32", "int32"], + [8000, 16000], + [1, 2], + ) + def test_wav(self, dtype, sample_rate, num_channels): + """`soundfile_backend.info` can check wav file correctly""" + duration = 1 + path = self.get_temp_path("data.wav") + data = get_wav_data(dtype, num_channels, normalize=False, num_frames=duration * sample_rate) + save_wav(path, data, sample_rate) + info = soundfile_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_rate * duration + assert info.num_channels == num_channels + assert info.bits_per_sample == get_bits_per_sample("wav", dtype) + assert info.encoding == get_encoding("wav", dtype) + + @parameterize([8000, 16000], [1, 2]) + @skipIfFormatNotSupported("FLAC") + def test_flac(self, sample_rate, num_channels): + """`soundfile_backend.info` can check flac file correctly""" + duration = 1 + num_frames = sample_rate * duration + #data = torch.randn(num_frames, num_channels).numpy() + data = paddle.randn(shape=[num_frames, num_channels]).numpy() + + path = self.get_temp_path("data.flac") + soundfile.write(path, data, sample_rate) + + info = soundfile_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == num_frames + assert info.num_channels == num_channels + assert info.bits_per_sample == 16 + assert info.encoding == "FLAC" + + #@parameterize([8000, 16000], [1, 2]) + #@skipIfFormatNotSupported("OGG") + #def test_ogg(self, sample_rate, num_channels): + #"""`soundfile_backend.info` can check ogg file correctly""" + #duration = 1 + #num_frames = sample_rate * duration + ##data = torch.randn(num_frames, num_channels).numpy() + #data = paddle.randn(shape=[num_frames, num_channels]).numpy() + #print(len(data)) + #path = self.get_temp_path("data.ogg") + #soundfile.write(path, data, sample_rate) + + #info = soundfile_backend.info(path) + #print(info) + #assert info.sample_rate == sample_rate + #print("info") + #print(info.num_frames) + #print("jiji") + #print(sample_rate*duration) + ##assert info.num_frames == sample_rate * duration + #assert info.num_channels == num_channels + #assert info.bits_per_sample == 0 + #assert info.encoding == "VORBIS" + + @nested_params( + [8000, 16000], + [1, 2], + [("PCM_24", 24), ("PCM_32", 32)], + ) + @skipIfFormatNotSupported("NIST") + def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth): + """`soundfile_backend.info` can check sph file correctly""" + duration = 1 + num_frames = sample_rate * duration + #data = torch.randn(num_frames, num_channels).numpy() + data = 
paddle.randn(shape=[num_frames, num_channels]).numpy()
+        path = self.get_temp_path("data.nist")
+        subtype, bits_per_sample = subtype_and_bit_depth
+        soundfile.write(path, data, sample_rate, subtype=subtype)
+
+        info = soundfile_backend.info(path)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == sample_rate * duration
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == bits_per_sample
+        assert info.encoding == "PCM_S"
+
+    def test_unknown_subtype_warning(self):
+        """soundfile_backend.info issues a warning when the subtype is unknown
+
+        This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE
+        dict should be updated.
+        """
+
+        def _mock_info_func(_):
+            class MockSoundFileInfo:
+                samplerate = 8000
+                frames = 356
+                channels = 2
+                subtype = "UNSEEN_SUBTYPE"
+                format = "UNKNOWN"
+
+            return MockSoundFileInfo()
+
+        with patch("soundfile.info", _mock_info_func):
+            with warnings.catch_warnings(record=True) as w:
+                info = soundfile_backend.info("foo")
+            assert len(w) == 1
+            assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(w[-1].message)
+            assert info.bits_per_sample == 0
+
+
+class TestFileObject(TempDirMixin, unittest.TestCase):
+    def _test_fileobj(self, ext, subtype, bits_per_sample):
+        """Query audio via file-like object works"""
+        duration = 2
+        sample_rate = 16000
+        num_channels = 2
+        num_frames = sample_rate * duration
+        path = self.get_temp_path(f"test.{ext}")
+
+        #data = torch.randn(num_frames, num_channels).numpy()
+        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+        soundfile.write(path, data, sample_rate, subtype=subtype)
+
+        with open(path, "rb") as fileobj:
+            info = soundfile_backend.info(fileobj)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == num_frames
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == bits_per_sample
+        assert info.encoding == ("FLAC" if ext == "flac" else "PCM_S")
+
+    def test_fileobj_wav(self):
+        """Loading audio via file-like object works"""
+        self._test_fileobj("wav", "PCM_16", 16)
+
+    @skipIfFormatNotSupported("FLAC")
+    def test_fileobj_flac(self):
+        """Loading audio via file-like object works"""
+        self._test_fileobj("flac", "PCM_16", 16)
+
+    def _test_tarobj(self, ext, subtype, bits_per_sample):
+        """Query compressed audio via file-like object works"""
+        duration = 2
+        sample_rate = 16000
+        num_channels = 2
+        num_frames = sample_rate * duration
+        audio_file = f"test.{ext}"
+        audio_path = self.get_temp_path(audio_file)
+        archive_path = self.get_temp_path("archive.tar.gz")
+
+        #data = torch.randn(num_frames, num_channels).numpy()
+        data = paddle.randn(shape=[num_frames, num_channels]).numpy()
+        soundfile.write(audio_path, data, sample_rate, subtype=subtype)
+
+        with tarfile.TarFile(archive_path, "w") as tarobj:
+            tarobj.add(audio_path, arcname=audio_file)
+        with tarfile.TarFile(archive_path, "r") as tarobj:
+            fileobj = tarobj.extractfile(audio_file)
+            info = soundfile_backend.info(fileobj)
+        assert info.sample_rate == sample_rate
+        assert info.num_frames == num_frames
+        assert info.num_channels == num_channels
+        assert info.bits_per_sample == bits_per_sample
+        assert info.encoding == ("FLAC" if ext == "flac" else "PCM_S")
+
+    def test_tarobj_wav(self):
+        """Query compressed audio via file-like object works"""
+        self._test_tarobj("wav", "PCM_16", 16)
+
+    @skipIfFormatNotSupported("FLAC")
+    def test_tarobj_flac(self):
+        """Query compressed audio via file-like object works"""
+        self._test_tarobj("flac", "PCM_16", 16)
+ +if __name__ == '__main__': + unittest.main() diff --git a/audio/tests/backends/soundfile/load_test.py b/audio/tests/backends/soundfile/load_test.py new file mode 100644 index 000000000..d315703cb --- /dev/null +++ b/audio/tests/backends/soundfile/load_test.py @@ -0,0 +1,369 @@ +#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py + +import os +import tarfile +import unittest +from unittest.mock import patch +import numpy as np + +from parameterized import parameterized +import paddle +from paddleaudio._internal import module_utils as _mod_utils +from paddleaudio.backends import soundfile_backend +from tests.backends.common import get_bits_per_sample, get_encoding +from tests.common_utils import ( + get_wav_data, + load_wav, + nested_params, + normalize_wav, + save_wav, + TempDirMixin, +) + +from common import dtype2subtype, parameterize, skipIfFormatNotSupported + +import soundfile + + +def _get_mock_path( + ext: str, + dtype: str, + sample_rate: int, + num_channels: int, + num_frames: int, +): + return f"{dtype}_{sample_rate}_{num_channels}_{num_frames}.{ext}" + + +def _get_mock_params(path: str): + filename, ext = path.split(".") + parts = filename.split("_") + return { + "ext": ext, + "dtype": parts[0], + "sample_rate": int(parts[1]), + "num_channels": int(parts[2]), + "num_frames": int(parts[3]), + } + + +class SoundFileMock: + def __init__(self, path, mode): + assert mode == "r" + self.path = path + self._params = _get_mock_params(path) + self._start = None + + @property + def samplerate(self): + return self._params["sample_rate"] + + @property + def format(self): + if self._params["ext"] == "wav": + return "WAV" + if self._params["ext"] == "flac": + return "FLAC" + if self._params["ext"] == "ogg": + return "OGG" + if self._params["ext"] in ["sph", "nis", "nist"]: + return "NIST" + + @property + def subtype(self): + if self._params["ext"] == "ogg": + return "VORBIS" + return dtype2subtype(self._params["dtype"]) + + def _prepare_read(self, start, stop, frames): + assert stop is None + self._start = start + return frames + + def read(self, frames, dtype, always_2d): + assert always_2d + data = get_wav_data( + dtype, + self._params["num_channels"], + normalize=False, + num_frames=self._params["num_frames"], + channels_first=False, + ).numpy() + return data[self._start : self._start + frames] + + def __enter__(self): + return self + + def __exit__(self, *args, **kwargs): + pass + + +class MockedLoadTest(unittest.TestCase): + def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize, channels_first): + """When format is WAV or NIST, normalize=False will return the native dtype Tensor, otherwise float32""" + num_frames = 3 * sample_rate + path = _get_mock_path(ext, dtype, sample_rate, num_channels, num_frames) + expected_dtype = paddle.float32 if normalize or ext not in ["wav", "nist"] else getattr(paddle, dtype) + with patch("soundfile.SoundFile", SoundFileMock): + found, sr = soundfile_backend.load(path, normalize=normalize, channels_first=channels_first) + assert found.dtype == expected_dtype + assert sample_rate == sr + + @parameterize( + ["int32", "float32", "float64"], + [8000, 16000], + [1, 2], + [True, False], + [True, False], + ) + def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first): + """Returns native dtype when normalize=False else float32""" + self.assert_dtype("wav", dtype, sample_rate, num_channels, normalize, channels_first) + + @parameterize( + ["int32"], + [8000, 
16000],
+        [1, 2],
+        [True, False],
+        [True, False],
+    )
+    def test_sphere(self, dtype, sample_rate, num_channels, normalize, channels_first):
+        """Returns float32 always"""
+        self.assert_dtype("sph", dtype, sample_rate, num_channels, normalize, channels_first)
+
+    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
+    def test_ogg(self, sample_rate, num_channels, normalize, channels_first):
+        """Returns float32 always"""
+        self.assert_dtype("ogg", "int16", sample_rate, num_channels, normalize, channels_first)
+
+    @parameterize([8000, 16000], [1, 2], [True, False], [True, False])
+    def test_flac(self, sample_rate, num_channels, normalize, channels_first):
+        """Returns float32 always"""
+        self.assert_dtype("flac", "int16", sample_rate, num_channels, normalize, channels_first)
+
+
+class LoadTestBase(TempDirMixin, unittest.TestCase):
+    def assert_wav(
+        self,
+        dtype,
+        sample_rate,
+        num_channels,
+        normalize,
+        channels_first=True,
+        duration=1,
+    ):
+        """`soundfile_backend.load` can load wav format correctly.
+
+        Wav data loaded with the soundfile backend should match data loaded with scipy.
+        """
+        path = self.get_temp_path("reference.wav")
+        num_frames = duration * sample_rate
+        data = get_wav_data(
+            dtype,
+            num_channels,
+            normalize=normalize,
+            num_frames=num_frames,
+            channels_first=channels_first,
+        )
+        save_wav(path, data, sample_rate, channels_first=channels_first)
+        expected = load_wav(path, normalize=normalize, channels_first=channels_first)[0]
+        data, sr = soundfile_backend.load(path, normalize=normalize, channels_first=channels_first)
+        assert sr == sample_rate
+        np.testing.assert_array_almost_equal(data.numpy(), expected.numpy())
+
+    def assert_sphere(
+        self,
+        dtype,
+        sample_rate,
+        num_channels,
+        channels_first=True,
+        duration=1,
+    ):
+        """`soundfile_backend.load` can load SPHERE format correctly."""
+        path = self.get_temp_path("reference.sph")
+        num_frames = duration * sample_rate
+        raw = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=num_frames,
+            normalize=False,
+            channels_first=False,
+        )
+        soundfile.write(path, raw, sample_rate, subtype=dtype2subtype(dtype), format="NIST")
+        expected = normalize_wav(raw.t() if channels_first else raw)
+        data, sr = soundfile_backend.load(path, channels_first=channels_first)
+        assert sr == sample_rate
+        #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8)
+        np.testing.assert_array_almost_equal(data.numpy(), expected.numpy())
+
+    def assert_flac(
+        self,
+        dtype,
+        sample_rate,
+        num_channels,
+        channels_first=True,
+        duration=1,
+    ):
+        """`soundfile_backend.load` can load FLAC format correctly."""
+        path = self.get_temp_path("reference.flac")
+        num_frames = duration * sample_rate
+        raw = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=num_frames,
+            normalize=False,
+            channels_first=False,
+        )
+        soundfile.write(path, raw, sample_rate)
+        expected = normalize_wav(raw.t() if channels_first else raw)
+        data, sr = soundfile_backend.load(path, channels_first=channels_first)
+        assert sr == sample_rate
+        #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8)
+        np.testing.assert_array_almost_equal(data.numpy(), expected.numpy())
+
+
+
+class TestLoad(LoadTestBase):
+    """Test the correctness of `soundfile_backend.load` for various formats"""
+
+    @parameterize(
+        ["float32", "int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [False, True],
+    )
+    def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first):
+        """`soundfile_backend.load` can load wav format correctly."""
self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first) + + @parameterize( + ["int32"], + [16000], + [2], + [False], + ) + def test_wav_large(self, dtype, sample_rate, num_channels, normalize): + """`soundfile_backend.load` can load large wav file correctly.""" + two_hours = 2 * 60 * 60 + self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=two_hours) + + @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True]) + def test_multiple_channels(self, dtype, num_channels, channels_first): + """`soundfile_backend.load` can load wav file with more than 2 channels.""" + sample_rate = 8000 + normalize = False + self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first) + + #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True]) + #@skipIfFormatNotSupported("NIST") + #def test_sphere(self, dtype, sample_rate, num_channels, channels_first): + #"""`soundfile_backend.load` can load sphere format correctly.""" + #self.assert_sphere(dtype, sample_rate, num_channels, channels_first) + + #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True]) + #@skipIfFormatNotSupported("FLAC") + #def test_flac(self, dtype, sample_rate, num_channels, channels_first): + #"""`soundfile_backend.load` can load flac format correctly.""" + #self.assert_flac(dtype, sample_rate, num_channels, channels_first) + + +class TestLoadFormat(TempDirMixin, unittest.TestCase): + """Given `format` parameter, `so.load` can load files without extension""" + + original = None + path = None + + def _make_file(self, format_): + sample_rate = 8000 + path_with_ext = self.get_temp_path(f"test.{format_}") + data = get_wav_data("float32", num_channels=2).numpy().T + soundfile.write(path_with_ext, data, sample_rate) + expected = soundfile.read(path_with_ext, dtype="float32")[0].T + path = os.path.splitext(path_with_ext)[0] + os.rename(path_with_ext, path) + return path, expected + + def _test_format(self, format_): + """Providing format allows to read file without extension""" + path, expected = self._make_file(format_) + found, _ = soundfile_backend.load(path) + #self.assertEqual(found, expected) + np.testing.assert_array_almost_equal(found, expected) + + @parameterized.expand( + [ + ("WAV",), + ("wav",), + ] + ) + def test_wav(self, format_): + self._test_format(format_) + + @parameterized.expand( + [ + ("FLAC",), + ("flac",), + ] + ) + @skipIfFormatNotSupported("FLAC") + def test_flac(self, format_): + self._test_format(format_) + + +class TestFileObject(TempDirMixin, unittest.TestCase): + def _test_fileobj(self, ext): + """Loading audio via file-like object works""" + sample_rate = 16000 + path = self.get_temp_path(f"test.{ext}") + + data = get_wav_data("float32", num_channels=2).numpy().T + soundfile.write(path, data, sample_rate) + expected = soundfile.read(path, dtype="float32")[0].T + + with open(path, "rb") as fileobj: + found, sr = soundfile_backend.load(fileobj) + assert sr == sample_rate + #self.assertEqual(expected, found) + np.testing.assert_array_almost_equal(found, expected) + + def test_fileobj_wav(self): + """Loading audio via file-like object works""" + self._test_fileobj("wav") + + def test_fileobj_flac(self): + """Loading audio via file-like object works""" + self._test_fileobj("flac") + + def _test_tarfile(self, ext): + """Loading audio via file-like object works""" + sample_rate = 16000 + audio_file = f"test.{ext}" + audio_path = self.get_temp_path(audio_file) + archive_path = self.get_temp_path("archive.tar.gz") + + data = get_wav_data("float32", 
num_channels=2).numpy().T
+        soundfile.write(audio_path, data, sample_rate)
+        expected = soundfile.read(audio_path, dtype="float32")[0].T
+
+        with tarfile.TarFile(archive_path, "w") as tarobj:
+            tarobj.add(audio_path, arcname=audio_file)
+        with tarfile.TarFile(archive_path, "r") as tarobj:
+            fileobj = tarobj.extractfile(audio_file)
+            found, sr = soundfile_backend.load(fileobj)
+
+        assert sr == sample_rate
+        #self.assertEqual(expected, found)
+        np.testing.assert_array_almost_equal(found.numpy(), expected)
+
+
+    def test_tarfile_wav(self):
+        """Loading audio via file-like object works"""
+        self._test_tarfile("wav")
+
+    def test_tarfile_flac(self):
+        """Loading audio via file-like object works"""
+        self._test_tarfile("flac")
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/audio/tests/backends/soundfile/save_test.py b/audio/tests/backends/soundfile/save_test.py
new file mode 100644
index 000000000..28f0e5c79
--- /dev/null
+++ b/audio/tests/backends/soundfile/save_test.py
@@ -0,0 +1,322 @@
+import io
+import unittest
+from unittest.mock import patch
+
+from paddleaudio._internal import module_utils as _mod_utils
+from paddleaudio.backends import soundfile_backend
+from tests.common_utils import (
+    get_wav_data,
+    load_wav,
+    nested_params,
+    normalize_wav,
+    save_wav,
+    TempDirMixin,
+)
+
+from common import fetch_wav_subtype, parameterize, skipIfFormatNotSupported
+
+import paddle
+import numpy as np
+
+import soundfile
+
+
+class MockedSaveTest(unittest.TestCase):
+    @nested_params(
+        ["float32", "int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [
+            (None, None),
+            ("PCM_U", None),
+            ("PCM_U", 8),
+            ("PCM_S", None),
+            ("PCM_S", 16),
+            ("PCM_S", 32),
+            ("PCM_F", None),
+            ("PCM_F", 32),
+            ("PCM_F", 64),
+            ("ULAW", None),
+            ("ULAW", 8),
+            ("ALAW", None),
+            ("ALAW", 8),
+        ],
+    )
+    @patch("soundfile.write")
+    def test_wav(self, dtype, sample_rate, num_channels, channels_first, enc_params, mocked_write):
+        """soundfile_backend.save passes correct subtype to soundfile.write when WAV"""
+        filepath = "foo.wav"
+        input_tensor = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=3 * sample_rate,
+            normalize=dtype == "float32",
+            channels_first=channels_first,
+        )
+        input_tensor = paddle.transpose(input_tensor, [1, 0])
+
+        encoding, bits_per_sample = enc_params
+        soundfile_backend.save(
+            filepath,
+            input_tensor,
+            sample_rate,
+            channels_first=channels_first,
+            encoding=encoding,
+            bits_per_sample=bits_per_sample,
+        )
+
+        # On Python 3.8+, call_args.kwargs is more descriptive
+        args = mocked_write.call_args[1]
+        assert args["file"] == filepath
+        assert args["samplerate"] == sample_rate
+        assert args["subtype"] == fetch_wav_subtype(dtype, encoding, bits_per_sample)
+        assert args["format"] is None
+        tensor_result = paddle.transpose(input_tensor, [1, 0]) if channels_first else input_tensor
+        #self.assertEqual(args["data"], tensor_result.numpy())
+        np.testing.assert_array_almost_equal(args["data"].numpy(), tensor_result.numpy())
+
+
+
+    @patch("soundfile.write")
+    def assert_non_wav(
+        self,
+        fmt,
+        dtype,
+        sample_rate,
+        num_channels,
+        channels_first,
+        mocked_write,
+        encoding=None,
+        bits_per_sample=None,
+    ):
+        """soundfile_backend.save passes correct subtype and format to soundfile.write when SPHERE"""
+        filepath = f"foo.{fmt}"
+        input_tensor = get_wav_data(
+            dtype,
+            num_channels,
+            num_frames=3 * sample_rate,
+            normalize=False,
+            channels_first=channels_first,
+        )
+        input_tensor = paddle.transpose(input_tensor, [1, 0])
+
+        expected_data = paddle.transpose(input_tensor, [1, 0]) if channels_first else input_tensor
+
+        soundfile_backend.save(
+            filepath,
+            input_tensor,
+            sample_rate,
+            channels_first,
+            encoding=encoding,
+            bits_per_sample=bits_per_sample,
+        )
+
+        # On Python 3.8+, call_args.kwargs is more descriptive
+        args = mocked_write.call_args[1]
+        assert args["file"] == filepath
+        assert args["samplerate"] == sample_rate
+        if fmt in ["sph", "nist", "nis"]:
+            assert args["format"] == "NIST"
+        else:
+            assert args["format"] is None
+        np.testing.assert_array_almost_equal(args["data"].numpy(), expected_data.numpy())
+        #self.assertEqual(args["data"], expected_data)
+
+    @nested_params(
+        ["sph", "nist", "nis"],
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [
+            ("PCM_S", 8),
+            ("PCM_S", 16),
+            ("PCM_S", 24),
+            ("PCM_S", 32),
+            ("ULAW", 8),
+            ("ALAW", 8),
+            ("ALAW", 16),
+            ("ALAW", 24),
+            ("ALAW", 32),
+        ],
+    )
+    def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first, enc_params):
+        """soundfile_backend.save passes default format and subtype (None-s) to
+        soundfile.write when not WAV"""
+        encoding, bits_per_sample = enc_params
+        self.assert_non_wav(
+            fmt, dtype, sample_rate, num_channels, channels_first, encoding=encoding, bits_per_sample=bits_per_sample
+        )
+
+    @parameterize(
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+        [8, 16, 24],
+    )
+    def test_flac(self, dtype, sample_rate, num_channels, channels_first, bits_per_sample):
+        """soundfile_backend.save passes default format and subtype (None-s) to
+        soundfile.write when not WAV"""
+        self.assert_non_wav("flac", dtype, sample_rate, num_channels, channels_first, bits_per_sample=bits_per_sample)
+
+    @parameterize(
+        ["int32"],
+        [8000, 16000],
+        [1, 2],
+        [False, True],
+    )
+    def test_ogg(self, dtype, sample_rate, num_channels, channels_first):
+        """soundfile_backend.save passes default format and subtype (None-s) to
+        soundfile.write when not WAV"""
+        self.assert_non_wav("ogg", dtype, sample_rate, num_channels, channels_first)
+
+
+class SaveTestBase(TempDirMixin, unittest.TestCase):
+    def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
+        """`soundfile_backend.save` can save wav format."""
+        path = self.get_temp_path("data.wav")
+        expected = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False)
+        soundfile_backend.save(path, expected, sample_rate)
+        found, sr = load_wav(path, normalize=False)
+        assert sample_rate == sr
+        #self.assertEqual(found, expected)
+        np.testing.assert_array_almost_equal(found.numpy(), expected.numpy())
+
+    def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels):
+        """`soundfile_backend.save` can save non-wav format.
+
+        Due to precision mismatch, and the lack of an alternative way to decode the
+        resulting files without using soundfile, only metadata are validated.
+ """ + num_frames = sample_rate * 3 + path = self.get_temp_path(f"data.{fmt}") + expected = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False) + soundfile_backend.save(path, expected, sample_rate) + sinfo = soundfile.info(path) + assert sinfo.format == fmt.upper() + #assert sinfo.frames == num_frames this go wrong + assert sinfo.channels == num_channels + assert sinfo.samplerate == sample_rate + + def assert_flac(self, dtype, sample_rate, num_channels): + """`soundfile_backend.save` can save flac format.""" + self._assert_non_wav("flac", dtype, sample_rate, num_channels) + + def assert_sphere(self, dtype, sample_rate, num_channels): + """`soundfile_backend.save` can save sph format.""" + self._assert_non_wav("nist", dtype, sample_rate, num_channels) + + def assert_ogg(self, dtype, sample_rate, num_channels): + """`soundfile_backend.save` can save ogg format. + + As we cannot inspect the OGG format (it's lossy), we only check the metadata. + """ + self._assert_non_wav("ogg", dtype, sample_rate, num_channels) + + +class TestSave(SaveTestBase): + @parameterize( + ["float32", "int32"], + [8000, 16000], + [1, 2], + ) + def test_wav(self, dtype, sample_rate, num_channels): + """`soundfile_backend.save` can save wav format.""" + self.assert_wav(dtype, sample_rate, num_channels, num_frames=None) + + @parameterize( + ["float32", "int32"], + [4, 8, 16, 32], + ) + def test_multiple_channels(self, dtype, num_channels): + """`soundfile_backend.save` can save wav with more than 2 channels.""" + sample_rate = 8000 + self.assert_wav(dtype, sample_rate, num_channels, num_frames=None) + + @parameterize( + ["int32"], + [8000, 16000], + [1, 2], + ) + @skipIfFormatNotSupported("NIST") + def test_sphere(self, dtype, sample_rate, num_channels): + """`soundfile_backend.save` can save sph format.""" + self.assert_sphere(dtype, sample_rate, num_channels) + + @parameterize( + [8000, 16000], + [1, 2], + ) + @skipIfFormatNotSupported("FLAC") + def test_flac(self, sample_rate, num_channels): + """`soundfile_backend.save` can save flac format.""" + self.assert_flac("float32", sample_rate, num_channels) + + @parameterize( + [8000, 16000], + [1, 2], + ) + @skipIfFormatNotSupported("OGG") + def test_ogg(self, sample_rate, num_channels): + """`soundfile_backend.save` can save ogg/vorbis format.""" + self.assert_ogg("float32", sample_rate, num_channels) + + +class TestSaveParams(TempDirMixin, unittest.TestCase): + """Test the correctness of optional parameters of `soundfile_backend.save`""" + + @parameterize([True, False]) + def test_channels_first(self, channels_first): + """channels_first swaps axes""" + path = self.get_temp_path("data.wav") + data = get_wav_data("int32", 2, channels_first=channels_first) + soundfile_backend.save(path, data, 8000, channels_first=channels_first) + found = load_wav(path)[0] + expected = data if channels_first else data.transpose([1, 0]) + #self.assertEqual(found, expected, atol=1e-4, rtol=1e-8) + np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) + + +class TestFileObject(TempDirMixin, unittest.TestCase): + def _test_fileobj(self, ext): + """Saving audio to file-like object works""" + sample_rate = 16000 + path = self.get_temp_path(f"test.{ext}") + + subtype = "FLOAT" if ext == "wav" else None + data = get_wav_data("float32", num_channels=2) + soundfile.write(path, data.numpy().T, sample_rate, subtype=subtype) + expected = soundfile.read(path, dtype="float32")[0] + + fileobj = io.BytesIO() + soundfile_backend.save(fileobj, data, sample_rate, 
format=ext) + fileobj.seek(0) + found, sr = soundfile.read(fileobj, dtype="float32") + + assert sr == sample_rate + #self.assertEqual(expected, found, atol=1e-4, rtol=1e-8) + np.testing.assert_array_almost_equal(found, expected) + + def test_fileobj_wav(self): + """Saving audio via file-like object works""" + self._test_fileobj("wav") + + @skipIfFormatNotSupported("FLAC") + def test_fileobj_flac(self): + """Saving audio via file-like object works""" + self._test_fileobj("flac") + + @skipIfFormatNotSupported("NIST") + def test_fileobj_nist(self): + """Saving audio via file-like object works""" + self._test_fileobj("NIST") + + @skipIfFormatNotSupported("OGG") + def test_fileobj_ogg(self): + """Saving audio via file-like object works""" + self._test_fileobj("OGG") + +if __name__ == '__main__': + unittest.main() diff --git a/audio/tests/backends/soundfile/test_io.py b/audio/tests/backends/soundfile/test_io.py new file mode 100644 index 000000000..eed1b39fb --- /dev/null +++ b/audio/tests/backends/soundfile/test_io.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import filecmp +import os +import unittest + +import numpy as np +from paddleaudio.backends import soundfile_load as load +from paddleaudio.backends import soundfile_save as save +import soundfile as sf + +from base import BackendTest + + +class TestIO(BackendTest): + def test_load_mono_channel(self): + sf_data, sf_sr = sf.read(self.files[0]) + pa_data, pa_sr = load( + self.files[0], normal=False, dtype='float64') + + self.assertEqual(sf_data.dtype, pa_data.dtype) + self.assertEqual(sf_sr, pa_sr) + np.testing.assert_array_almost_equal(sf_data, pa_data) + + def test_load_multi_channels(self): + sf_data, sf_sr = sf.read(self.files[1]) + sf_data = sf_data.T # Channel dim first + pa_data, pa_sr = load( + self.files[1], mono=False, normal=False, dtype='float64') + + self.assertEqual(sf_data.dtype, pa_data.dtype) + self.assertEqual(sf_sr, pa_sr) + np.testing.assert_array_almost_equal(sf_data, pa_data) + + def test_save_mono_channel(self): + waveform, sr = np.random.randint( + low=-32768, high=32768, size=(48000), dtype=np.int16), 16000 + sf_tmp_file = 'sf_tmp.wav' + pa_tmp_file = 'pa_tmp.wav' + + sf.write(sf_tmp_file, waveform, sr) + save(waveform, sr, pa_tmp_file) + + self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) + for file in [sf_tmp_file, pa_tmp_file]: + os.remove(file) + + def test_save_multi_channels(self): + waveform, sr = np.random.randint( + low=-32768, high=32768, size=(2, 48000), dtype=np.int16), 16000 + sf_tmp_file = 'sf_tmp.wav' + pa_tmp_file = 'pa_tmp.wav' + + sf.write(sf_tmp_file, waveform.T, sr) + save(waveform.T, sr, pa_tmp_file) + + self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) + for file in [sf_tmp_file, pa_tmp_file]: + os.remove(file) + + +if __name__ == '__main__': + unittest.main() diff --git a/audio/tests/benchmark/README.md b/audio/tests/benchmark/README.md new file mode 100644 index 000000000..b9034100d --- /dev/null +++ 
b/audio/tests/benchmark/README.md @@ -0,0 +1,39 @@ +# 1. Prepare +First, install `pytest-benchmark` via pip. +```sh +pip install pytest-benchmark +``` + +# 2. Run +Run the specific script for profiling. +```sh +pytest melspectrogram.py +``` + +Result: +```sh +========================================================================== test session starts ========================================================================== +platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0 +benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) +rootdir: /ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddleaudio +plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0 +collected 4 items + +melspectrogram.py .... [100%] + + +-------------------------------------------------------------------------------------------------- benchmark: 4 tests ------------------------------------------------------------------------------------------------- +Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +test_melspect_gpu_torchaudio 202.0765 (1.0) 360.6230 (1.0) 218.1168 (1.0) 16.3022 (1.0) 214.2871 (1.0) 21.8451 (1.0) 40;3 4,584.7001 (1.0) 286 1 +test_melspect_gpu 657.8509 (3.26) 908.0470 (2.52) 724.2545 (3.32) 106.5771 (6.54) 669.9096 (3.13) 113.4719 (5.19) 1;0 1,380.7300 (0.30) 5 1 +test_melspect_cpu_torchaudio 1,247.6053 (6.17) 2,892.5799 (8.02) 1,443.2853 (6.62) 345.3732 (21.19) 1,262.7263 (5.89) 221.6385 (10.15) 56;53 692.8637 (0.15) 399 1 +test_melspect_cpu 20,326.2549 (100.59) 20,607.8682 (57.15) 20,473.4125 (93.86) 63.8654 (3.92) 20,467.0429 (95.51) 68.4294 (3.13) 8;1 48.8438 (0.01) 29 1 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Legend: + Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. + OPS: Operations Per Second, computed as 1 / Mean +========================================================================== 4 passed in 21.12s =========================================================================== + +``` diff --git a/audio/tests/benchmark/log_melspectrogram.py b/audio/tests/benchmark/log_melspectrogram.py new file mode 100644 index 000000000..79b5406d2 --- /dev/null +++ b/audio/tests/benchmark/log_melspectrogram.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
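+# This benchmark uses pytest-benchmark: each test_* function below receives the +# `benchmark` fixture, times one feature-extraction call, and then checks the +# result against librosa as the numerical reference. The torchaudio variants +# are included for a cross-framework comparison.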
+import os +import urllib.request + +import librosa +import numpy as np +import paddle +import paddleaudio +import torch +import torchaudio + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + +# Feature conf +mel_conf = { + 'sr': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, +} + +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} + + +def enable_cpu_device(): + paddle.set_device('cpu') + + +def enable_gpu_device(): + paddle.set_device('gpu') + + +log_mel_extractor = paddleaudio.features.LogMelSpectrogram( + **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype) + + +def log_melspectrogram(): + return log_mel_extractor(waveform_tensor).squeeze(0) + + +def test_log_melspect_cpu(benchmark): + enable_cpu_device() + feature_paddleaudio = benchmark(log_melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_log_melspect_gpu(benchmark): + enable_gpu_device() + feature_paddleaudio = benchmark(log_melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=2) + + +mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( + **mel_conf_torchaudio, f_min=0.0) +amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db=80.0) + + +def log_melspectrogram_torchaudio(): + mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch) + return amplitude_to_DB(mel_specgram).squeeze(0) + + +def test_log_melspect_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB + + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + amplitude_to_DB = amplitude_to_DB.to('cpu') + + feature_torchaudio = benchmark(log_melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio, decimal=3) + + +def test_log_melspect_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB + + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + amplitude_to_DB = amplitude_to_DB.to('cuda') + + feature_torchaudio = benchmark(log_melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), decimal=2) diff --git a/audio/tests/benchmark/melspectrogram.py 
b/audio/tests/benchmark/melspectrogram.py new file mode 100644 index 000000000..34e65bcb5 --- /dev/null +++ b/audio/tests/benchmark/melspectrogram.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import urllib.request + +import librosa +import numpy as np +import paddle +import paddleaudio +import torch +import torchaudio + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + +# Feature conf +mel_conf = { + 'sr': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, +} + +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} + + +def enable_cpu_device(): + paddle.set_device('cpu') + + +def enable_gpu_device(): + paddle.set_device('gpu') + + +mel_extractor = paddleaudio.features.MelSpectrogram( + **mel_conf, f_min=0.0, dtype=waveform_tensor.dtype) + + +def melspectrogram(): + return mel_extractor(waveform_tensor).squeeze(0) + + +def test_melspect_cpu(benchmark): + enable_cpu_device() + feature_paddleaudio = benchmark(melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_melspect_gpu(benchmark): + enable_gpu_device() + feature_paddleaudio = benchmark(melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( + **mel_conf_torchaudio, f_min=0.0) + + +def melspectrogram_torchaudio(): + return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def test_melspect_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + feature_torchaudio = benchmark(melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio, decimal=3) + + +def test_melspect_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + feature_torchaudio = benchmark(melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), 
decimal=3) diff --git a/audio/tests/benchmark/mfcc.py b/audio/tests/benchmark/mfcc.py new file mode 100644 index 000000000..4173c4bec --- /dev/null +++ b/audio/tests/benchmark/mfcc.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import urllib.request + +import librosa +import numpy as np +import paddle +import paddleaudio +import torch +import torchaudio + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.backends.soundfile_load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + +# Feature conf +mel_conf = { + 'sr': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, +} +mfcc_conf = { + 'n_mfcc': 20, + 'top_db': 80.0, +} +mfcc_conf.update(mel_conf) + +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} +mfcc_conf_torchaudio = { + 'sample_rate': sr, + 'n_mfcc': 20, +} + + +def enable_cpu_device(): + paddle.set_device('cpu') + + +def enable_gpu_device(): + paddle.set_device('gpu') + + +mfcc_extractor = paddleaudio.features.MFCC( + **mfcc_conf, f_min=0.0, dtype=waveform_tensor.dtype) + + +def mfcc(): + return mfcc_extractor(waveform_tensor).squeeze(0) + + +def test_mfcc_cpu(benchmark): + enable_cpu_device() + feature_paddleaudio = benchmark(mfcc) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_mfcc_gpu(benchmark): + enable_gpu_device() + feature_paddleaudio = benchmark(mfcc) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +# torchaudio's MFCC takes sample_rate as its own argument; drop it from melkwargs. +del mel_conf_torchaudio['sample_rate'] +mfcc_extractor_torchaudio = torchaudio.transforms.MFCC( + **mfcc_conf_torchaudio, melkwargs=mel_conf_torchaudio) + + +def mfcc_torchaudio(): + return mfcc_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def test_mfcc_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mfcc_extractor_torchaudio + + mfcc_extractor_torchaudio = mfcc_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + + feature_torchaudio = benchmark(mfcc_torchaudio) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio, decimal=3) + + +def test_mfcc_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mfcc_extractor_torchaudio + + mfcc_extractor_torchaudio = mfcc_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + + feature_torchaudio = benchmark(mfcc_torchaudio) + feature_librosa = 
librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), decimal=3) diff --git a/audio/tests/common_utils/__init__.py b/audio/tests/common_utils/__init__.py new file mode 100644 index 000000000..32b785124 --- /dev/null +++ b/audio/tests/common_utils/__init__.py @@ -0,0 +1,17 @@ +from .wav_utils import get_wav_data, load_wav, save_wav, normalize_wav +from .parameterized_utils import nested_params +from .case_utils import ( + TempDirMixin, + name_func +) + +__all__ = [ + "get_wav_data", + "load_wav", + "save_wav", + "normalize_wav", + "name_func", + "nested_params", + "TempDirMixin" +] diff --git a/audio/tests/common_utils/case_utils.py b/audio/tests/common_utils/case_utils.py new file mode 100644 index 000000000..328c3de43 --- /dev/null +++ b/audio/tests/common_utils/case_utils.py @@ -0,0 +1,56 @@ +import os +import tempfile + +# Code adapted from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/common_utils/case_utils.py + + +def name_func(func, _, params): + return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' + +class TempDirMixin: + """Mixin to provide easy access to a temp dir""" + + temp_dir_ = None + + @classmethod + def get_base_temp_dir(cls): + # If PADDLEAUDIO_TEST_TEMP_DIR is set, use it instead of a temporary directory. + # This is handy for debugging. + key = "PADDLEAUDIO_TEST_TEMP_DIR" + if key in os.environ: + return os.environ[key] + if cls.temp_dir_ is None: + cls.temp_dir_ = tempfile.TemporaryDirectory() + return cls.temp_dir_.name + + @classmethod + def tearDownClass(cls): + if cls.temp_dir_ is not None: + try: + cls.temp_dir_.cleanup() + cls.temp_dir_ = None + except PermissionError: + # On Windows there is a known issue with `shutil.rmtree`, + # which fails intermittently. + # + # https://github.com/python/cpython/issues/74168 + # + # We observed this on CircleCI, where the Windows job raises + # PermissionError. + # + # Following the above thread, we ignore it. + pass + super().tearDownClass() + + def get_temp_path(self, *paths): + temp_dir = os.path.join(self.get_base_temp_dir(), self.id()) + path = os.path.join(temp_dir, *paths) + os.makedirs(os.path.dirname(path), exist_ok=True) + return path diff --git a/audio/tests/common_utils/parameterized_utils.py b/audio/tests/common_utils/parameterized_utils.py new file mode 100644 index 000000000..d27c27469 --- /dev/null +++ b/audio/tests/common_utils/parameterized_utils.py @@ -0,0 +1,43 @@ +from itertools import product + +from parameterized import param, parameterized + +def _name_func(func, _, params): + strs = [] + for arg in params.args: + if isinstance(arg, tuple): + strs.append("_".join(str(a) for a in arg)) + else: + strs.append(str(arg)) + # sanitize the test name + name = "_".join(strs) + return parameterized.to_safe_name(f"{func.__name__}_{name}") + + +def nested_params(*params_set, name_func=_name_func): + """Generate the Cartesian product of the given lists of parameters. + + Args: + params_set (list of parameters): Parameters. When using the ``parameterized.param`` class, + all the parameters have to be specified with that class, using keyword arguments only. 
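+ +    Example (illustrative; ``test_case`` is a hypothetical test): + +        @nested_params(["a", "b"], [1, 2]) +        def test_case(self, letter, number): +            ... + +        # Expands to test_case_a_1, test_case_a_2, test_case_b_1, test_case_b_2.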
+ """ + flatten = [p for params in params_set for p in params] + + # Parameters to be nested are given as list of plain objects + if all(not isinstance(p, param) for p in flatten): + args = list(product(*params_set)) + return parameterized.expand(args, name_func=_name_func) + + # Parameters to be nested are given as list of `parameterized.param` + if not all(isinstance(p, param) for p in flatten): + raise TypeError("When using ``parameterized.param``, " "all the parameters have to be of the ``param`` type.") + if any(p.args for p in flatten): + raise ValueError( + "When using ``parameterized.param``, " "all the parameters have to be provided as keyword argument." + ) + args = [param()] + for params in params_set: + args = [param(**x.kwargs, **y.kwargs) for x in args for y in params] + return parameterized.expand(args) diff --git a/audio/tests/common_utils/wav_utils.py b/audio/tests/common_utils/wav_utils.py new file mode 100644 index 000000000..25d0b1971 --- /dev/null +++ b/audio/tests/common_utils/wav_utils.py @@ -0,0 +1,102 @@ +from typing import Optional + +import scipy.io.wavfile +import paddle +import numpy as np + +def normalize_wav(tensor: paddle.Tensor) -> paddle.Tensor: + if tensor.dtype == paddle.float32: + pass + elif tensor.dtype == paddle.int32: + tensor = paddle.cast(tensor, paddle.float32) + tensor[tensor > 0] /= 2147483647.0 + tensor[tensor < 0] /= 2147483648.0 + elif tensor.dtype == paddle.int16: + tensor = paddle.cast(tensor, paddle.float32) + tensor[tensor > 0] /= 32767.0 + tensor[tensor < 0] /= 32768.0 + elif tensor.dtype == paddle.uint8: + tensor = paddle.cast(tensor, paddle.float32) - 128 + tensor[tensor > 0] /= 127.0 + tensor[tensor < 0] /= 128.0 + return tensor + + +def get_wav_data( + dtype: str, + num_channels: int, + *, + num_frames: Optional[int] = None, + normalize: bool = True, + channels_first: bool = True, +): + """Generate linear signal of the given dtype and num_channels + + Data range is + [-1.0, 1.0] for float32, + [-2147483648, 2147483647] for int32 + [-32768, 32767] for int16 + [0, 255] for uint8 + + num_frames allow to change the linear interpolation parameter. + Default values are 256 for uint8, else 1 << 16. + 1 << 16 as default is so that int16 value range is completely covered. 
+ """ + dtype_ = getattr(paddle, dtype) + + if num_frames is None: + if dtype == "uint8": + num_frames = 256 + else: + num_frames = 1 << 16 + + # paddle linspace not support uint8, int8, int16 + #if dtype == "uint8": + # base = paddle.linspace(0, 255, num_frames, dtype=dtype_) + #dtype_np = getattr(np, dtype) + #base_np = np.linspace(0, 255, num_frames, dtype_np) + #base = paddle.to_tensor(base_np, dtype=dtype_) + #elif dtype == "int8": + # base = paddle.linspace(-128, 127, num_frames, dtype=dtype_) + #dtype_np = getattr(np, dtype) + #base_np = np.linspace(-128, 127, num_frames, dtype_np) + #base = paddle.to_tensor(base_np, dtype=dtype_) + if dtype == "float32": + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "float64": + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "int32": + base = paddle.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) + #elif dtype == "int16": + # base = paddle.linspace(-32768, 32767, num_frames, dtype=dtype_) + #dtype_np = getattr(np, dtype) + #base_np = np.linspace(-32768, 32767, num_frames, dtype_np) + #base = paddle.to_tensor(base_np, dtype=dtype_) + else: + raise NotImplementedError(f"Unsupported dtype {dtype}") + data = base.tile([num_channels, 1]) + if not channels_first: + data = data.transpose([1, 0]) + if normalize: + data = normalize_wav(data) + return data + + +def load_wav(path: str, normalize=True, channels_first=True) -> paddle.Tensor: + """Load wav file without paddleaudio""" + sample_rate, data = scipy.io.wavfile.read(path) + data = paddle.to_tensor(data.copy()) + if data.ndim == 1: + data = data.unsqueeze(1) + if normalize: + data = normalize_wav(data) + if channels_first: + data = data.transpose([1, 0]) + return data, sample_rate + + +def save_wav(path, data, sample_rate, channels_first=True): + """Save wav file without paddleaudio""" + if channels_first: + data = data.transpose([1, 0]) + scipy.io.wavfile.write(path, sample_rate, data.numpy()) diff --git a/audio/tests/features/__init__.py b/audio/tests/features/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/audio/tests/features/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py new file mode 100644 index 000000000..d183b72ad --- /dev/null +++ b/audio/tests/features/base.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import urllib.request + +import numpy as np +import paddle +from paddleaudio.backends import soundfile_load as load + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' + + +class FeatTest(unittest.TestCase): + def setUp(self): + self.initParmas() + self.initWavInput() + self.setUpDevice() + + def setUpDevice(self, device='cpu'): + paddle.set_device(device) + + def initWavInput(self, url=wav_url): + if not os.path.isfile(os.path.basename(url)): + urllib.request.urlretrieve(url, os.path.basename(url)) + self.waveform, self.sr = load(os.path.abspath(os.path.basename(url))) + self.waveform = self.waveform.astype( + np.float32 + ) # paddlespeech.s2t.transform.spectrogram only supports float32 + dim = len(self.waveform.shape) + + assert dim in [1, 2] + if dim == 1: + self.waveform = np.expand_dims(self.waveform, 0) + + def initParmas(self): + raise NotImplementedError diff --git a/audio/tests/features/test_kaldi.py b/audio/tests/features/test_kaldi.py new file mode 100644 index 000000000..2bd5dc734 --- /dev/null +++ b/audio/tests/features/test_kaldi.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
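+# These tests treat torchaudio.compliance.kaldi as the reference implementation: +# window functions, fbank and mfcc features computed by paddleaudio are compared +# element-wise against their torchaudio counterparts.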
+import unittest + +import numpy as np +import paddle +import paddleaudio +import torch +import torchaudio + +from base import FeatTest + + +class TestKaldi(FeatTest): + def initParmas(self): + self.window_size = 1024 + self.dtype = 'float32' + + def test_window(self): + t_hann_window = torch.hann_window( + self.window_size, periodic=False, dtype=getattr(torch, self.dtype)) + t_hamm_window = torch.hamming_window( + self.window_size, + periodic=False, + alpha=0.54, + beta=0.46, + dtype=getattr(torch, self.dtype)) + t_povey_window = torch.hann_window( + self.window_size, periodic=False, + dtype=getattr(torch, self.dtype)).pow(0.85) + + p_hann_window = paddleaudio.functional.window.get_window( + 'hann', + self.window_size, + fftbins=False, + dtype=getattr(paddle, self.dtype)) + p_hamm_window = paddleaudio.functional.window.get_window( + 'hamming', + self.window_size, + fftbins=False, + dtype=getattr(paddle, self.dtype)) + p_povey_window = paddleaudio.functional.window.get_window( + 'hann', + self.window_size, + fftbins=False, + dtype=getattr(paddle, self.dtype)).pow(0.85) + + np.testing.assert_array_almost_equal(t_hann_window, p_hann_window) + np.testing.assert_array_almost_equal(t_hamm_window, p_hamm_window) + np.testing.assert_array_almost_equal(t_povey_window, p_povey_window) + + def test_fbank(self): + ta_features = torchaudio.compliance.kaldi.fbank( + torch.from_numpy(self.waveform.astype(self.dtype))) + pa_features = paddleaudio.compliance.kaldi.fbank( + paddle.to_tensor(self.waveform.astype(self.dtype))) + np.testing.assert_array_almost_equal( + ta_features, pa_features, decimal=4) + + def test_mfcc(self): + ta_features = torchaudio.compliance.kaldi.mfcc( + torch.from_numpy(self.waveform.astype(self.dtype))) + pa_features = paddleaudio.compliance.kaldi.mfcc( + paddle.to_tensor(self.waveform.astype(self.dtype))) + np.testing.assert_array_almost_equal( + ta_features, pa_features, decimal=4) + + +if __name__ == '__main__': + unittest.main() diff --git a/audio/tests/features/test_librosa.py b/audio/tests/features/test_librosa.py new file mode 100644 index 000000000..19d094b4b --- /dev/null +++ b/audio/tests/features/test_librosa.py @@ -0,0 +1,281 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
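+# These tests treat librosa as the numerical reference: stft/istft, mel filter +# banks, (log-)mel spectrograms and MFCC from paddleaudio's compliance module +# and feature layers are compared element-wise against librosa's output.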
+import unittest + +import librosa +import numpy as np +import paddle +import paddleaudio +from paddleaudio.functional.window import get_window + +from base import FeatTest + + +class TestLibrosa(FeatTest): + def initParmas(self): + self.n_fft = 512 + self.hop_length = 128 + self.n_mels = 40 + self.n_mfcc = 20 + self.fmin = 0.0 + self.window_str = 'hann' + self.pad_mode = 'reflect' + self.top_db = 80.0 + + def test_stft(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + feature_librosa = librosa.core.stft( + y=self.waveform, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=self.window_str, + center=True, + dtype=None, + pad_mode=self.pad_mode, ) + x = paddle.to_tensor(self.waveform).unsqueeze(0) + window = get_window(self.window_str, self.n_fft, dtype=x.dtype) + feature_paddle = paddle.signal.stft( + x=x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=window, + center=True, + pad_mode=self.pad_mode, + normalized=False, + onesided=True, ).squeeze(0) + + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddle, decimal=5) + + def test_istft(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # Get stft result from librosa. + stft_matrix = librosa.core.stft( + y=self.waveform, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=self.window_str, + center=True, + pad_mode=self.pad_mode, ) + + feature_librosa = librosa.core.istft( + stft_matrix=stft_matrix, + hop_length=self.hop_length, + win_length=None, + window=self.window_str, + center=True, + dtype=None, + length=None, ) + + x = paddle.to_tensor(stft_matrix).unsqueeze(0) + window = get_window( + self.window_str, + self.n_fft, + dtype=paddle.to_tensor(self.waveform).dtype) + feature_paddle = paddle.signal.istft( + x=x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=window, + center=True, + normalized=False, + onesided=True, + length=None, + return_complex=False, ).squeeze(0) + + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddle, decimal=5) + + def test_mel(self): + feature_librosa = librosa.filters.mel( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=None, + htk=False, + norm='slaney', + dtype=self.waveform.dtype, ) + feature_compliance = paddleaudio.compliance.librosa.compute_fbank_matrix( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=None, + htk=False, + norm='slaney', + dtype=self.waveform.dtype, ) + x = paddle.to_tensor(self.waveform) + feature_functional = paddleaudio.functional.compute_fbank_matrix( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + f_min=self.fmin, + f_max=None, + htk=False, + norm='slaney', + dtype=x.dtype, ) + + np.testing.assert_array_almost_equal(feature_librosa, + feature_compliance) + np.testing.assert_array_almost_equal(feature_librosa, + feature_functional) + + def test_melspect(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.melspectrogram( + y=self.waveform, + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + + # paddleaudio.compliance.librosa: + feature_compliance = 
paddleaudio.compliance.librosa.melspectrogram( + x=self.waveform, + sr=self.sr, + window_size=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin, + to_db=False) + + # paddleaudio.features.layer + x = paddle.to_tensor( + self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. + feature_extractor = paddleaudio.features.MelSpectrogram( + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=self.fmin, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal( + feature_librosa, feature_compliance, decimal=5) + np.testing.assert_array_almost_equal( + feature_librosa, feature_layer, decimal=5) + + def test_log_melspect(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.melspectrogram( + y=self.waveform, + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) + + # paddleaudio.compliance.librosa: + feature_compliance = paddleaudio.compliance.librosa.melspectrogram( + x=self.waveform, + sr=self.sr, + window_size=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + + # paddleaudio.features.layer + x = paddle.to_tensor( + self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. + feature_extractor = paddleaudio.features.LogMelSpectrogram( + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=self.fmin, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal( + feature_librosa, feature_compliance, decimal=5) + np.testing.assert_array_almost_equal( + feature_librosa, feature_layer, decimal=4) + + def test_mfcc(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.mfcc( + y=self.waveform, + sr=self.sr, + S=None, + n_mfcc=self.n_mfcc, + dct_type=2, + norm='ortho', + lifter=0, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + + # paddleaudio.compliance.librosa: + feature_compliance = paddleaudio.compliance.librosa.mfcc( + x=self.waveform, + sr=self.sr, + n_mfcc=self.n_mfcc, + dct_type=2, + norm='ortho', + lifter=0, + window_size=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin, + top_db=self.top_db) + + # paddleaudio.features.layer + x = paddle.to_tensor( + self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. + feature_extractor = paddleaudio.features.MFCC( + sr=self.sr, + n_mfcc=self.n_mfcc, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=self.fmin, + top_db=self.top_db, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal( + feature_librosa, feature_compliance, decimal=4) + np.testing.assert_array_almost_equal( + feature_librosa, feature_layer, decimal=4) + + +if __name__ == '__main__': + unittest.main()