add *.py; wo unitest

pull/3900/head
drryanhuang 10 months ago
parent f2ca05e830
commit e243128c0b

@@ -1,8 +1,9 @@
__version__ = "0.0.1"
from .core import AudioSignal
from .core import STFTParams
# from .core import Meter
from .core import Meter
from .core import util
from .core import highpass_filter, highpass_filters
from . import metrics
from . import data
from . import ml

@@ -1,4 +1,15 @@
from . import util
from ._julius import fft_conv1d
from ._julius import FFTConv1d
from ._julius import highpass_filter
from ._julius import highpass_filters
from ._julius import lowpass_filter
from ._julius import LowPassFilter
from ._julius import LowPassFilters
from ._julius import pure_tone
from ._julius import split_bands
from ._julius import SplitBands
from .audio_signal import AudioSignal
from .audio_signal import STFTParams
from .loudness import Meter
from .resample import resample_frac

@@ -0,0 +1,714 @@
# File under the MIT license, see https://github.com/adefossez/julius/LICENSE for details.
# Author: adefossez, 2020
"""
Implementation of an FFT-based 1D convolution in PaddlePaddle.
While FFT is used in some cases for small kernel sizes, it is not the default for long ones, e.g. 512.
This module implements efficient FFT based convolutions for such cases. A typical
application is for evaluating FIR filters with a long receptive field, typically
evaluated with a stride of 1.
"""
import math
import typing
from typing import Optional
from typing import Sequence
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from .resample import sinc
def pad_to(tensor: paddle.Tensor,
target_length: int,
mode: str="constant",
value: float=0.0):
"""
Pad the given tensor to the given length on the right, using the given mode and value (zeros by default).
"""
return F.pad(
tensor, (0, target_length - tensor.shape[-1]),
mode=mode,
value=value,
data_format="NCL")
def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
"""
Return a pure tone, i.e. cosine.
Args:
freq (float): frequency (in Hz)
sr (float): sample rate (in Hz)
dur (float): duration (in seconds)
"""
time = paddle.arange(int(sr * dur), dtype="float32") / sr
return paddle.cos(2 * math.pi * freq * time)
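# A minimal usage sketch for `pure_tone` (values are illustrative only):
#
#     tone = pure_tone(440.0, sr=16000, dur=1.0)   # 1 s cosine at 440 Hz
#     # tone has int(sr * dur) == 16000 samples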
def unfold(_input, kernel_size: int, stride: int):
"""1D only unfolding similar to the one from PyTorch.
However PyTorch unfold is extremely slow.
Given an _input tensor of size `[*, T]` this will return
a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
This will automatically pad the _input to cover at least once all entries in `_input`.
Args:
_input (Tensor): tensor for which to return the frames.
kernel_size (int): size of each frame.
stride (int): stride between each frame.
Shape:
- Inputs: `_input` is `[*, T]`
- Output: `[*, F, kernel_size]` with `F = 1 + ceil((T - kernel_size) / stride)`
..Warning:: unlike PyTorch unfold, this will pad the _input
so that any position in `_input` is covered by at least one frame.
"""
shape = list(_input.shape)
length = shape.pop(-1)
n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
tgt_length = (n_frames - 1) * stride + kernel_size
padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")
strides: typing.List[int] = []
for dim in range(padded.dim()):
strides.append(padded.strides[dim])
assert strides.pop(-1) == 1, "data should be contiguous"
strides = strides + [stride, 1]
return padded.as_strided(shape + [n_frames, kernel_size], strides)
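# A small sketch of `unfold` on a toy signal (shapes illustrative): with T=10,
# kernel_size=4 and stride=2, F = 1 + ceil((10 - 4) / 2) = 4 frames are produced,
# and the input is right-padded so that every sample is covered at least once.
#
#     x = paddle.arange(10, dtype="float32").reshape([1, 1, 10])
#     frames = unfold(x, kernel_size=4, stride=2)   # shape [1, 1, 4, 4]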
def _new_rfft(x: paddle.Tensor):
z = paddle.fft.rfft(x, axis=-1)
z_real = paddle.real(z)
z_imag = paddle.imag(z)
z_view_as_real = paddle.stack([z_real, z_imag], axis=-1)
return z_view_as_real
def _new_irfft(x: paddle.Tensor, length: int):
x_real = x[..., 0]
x_imag = x[..., 1]
x_view_as_complex = paddle.complex(x_real, x_imag)
return paddle.fft.irfft(x_view_as_complex, n=length, axis=-1)
def _compl_mul_conjugate(a: paddle.Tensor, b: paddle.Tensor):
"""
Given a and b two tensors of dimension 4
with the last dimension being the real and imaginary part,
returns a multiplied by the conjugate of b, the multiplication
being with respect to the second dimension.
PaddlePaddle does not have direct support for complex number operations
using einsum in the same manner as PyTorch, but we can manually compute
the equivalent result.
"""
# Extract the real and imaginary parts of a and b
real_a = a[..., 0]
imag_a = a[..., 1]
real_b = b[..., 0]
imag_b = b[..., 1]
# Compute the multiplication with respect to the second dimension manually
real_part = paddle.einsum("bcft,dct->bdft", real_a, real_b) + paddle.einsum(
"bcft,dct->bdft", imag_a, imag_b)
imag_part = paddle.einsum("bcft,dct->bdft", imag_a, real_b) - paddle.einsum(
"bcft,dct->bdft", real_a, imag_b)
# Stack the real and imaginary parts together
result = paddle.stack([real_part, imag_part], axis=-1)
return result
def fft_conv1d(
_input: paddle.Tensor,
weight: paddle.Tensor,
bias: Optional[paddle.Tensor]=None,
stride: int=1,
padding: int=0,
block_ratio: float=5, ):
"""
Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
Please check PaddlePaddle documentation for more information.
Args:
_input (Tensor): _input signal of shape `[B, C, T]`.
weight (Tensor): weight of the convolution `[D, C, K]` with `D` the number
of output channels.
bias (Tensor or None): if not None, bias term for the convolution.
stride (int): stride of convolution.
padding (int): padding to apply to the _input.
block_ratio (float): can be tuned for speed. The _input is split into chunks
with a size of `int(block_ratio * kernel_size)`.
Shape:
- Inputs: `_input` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
- Output: `[B, D, T']`
..note::
This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
Typically, the kernel size should be of the order of 256 to see any real gain,
for a stride of 1.
..Warning::
Dilation and groups are not supported at the moment. This function might use
more memory than the default Conv1d implementation.
"""
_input = F.pad(_input, (padding, padding), data_format="NCL")
batch, channels, length = _input.shape
out_channels, _, kernel_size = weight.shape
if length < kernel_size:
raise RuntimeError(
f"Input should be at least as large as the kernel size {kernel_size}, "
f"but it is only {length} samples long.")
if block_ratio < 1:
raise RuntimeError("Block ratio must be greater than 1.")
block_size: int = min(int(kernel_size * block_ratio), length)
fold_stride = block_size - kernel_size + 1
weight = pad_to(weight, block_size)
weight_z = _new_rfft(weight)
# We pad the _input and unfold it into frames, on which the convolution is applied block by block.
frames = unfold(_input, block_size, fold_stride)
frames_z = _new_rfft(frames)
out_z = _compl_mul_conjugate(frames_z, weight_z)
out = _new_irfft(out_z, block_size)
# The last bit is invalid, because FFT will do a circular convolution.
out = out[..., :-kernel_size + 1]
out = out.reshape([batch, out_channels, -1])
out = out[..., ::stride]
target_length = (length - kernel_size) // stride + 1
out = out[..., :target_length]
if bias is not None:
out += bias[:, None]
return out
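# A sanity-check sketch: for stride 1 and no dilation/groups, `fft_conv1d` is meant
# to match `paddle.nn.functional.conv1d` up to numerical error (shapes illustrative):
#
#     x = paddle.randn([2, 3, 2048])
#     w = paddle.randn([5, 3, 256])
#     ref = F.conv1d(x, w)
#     out = fft_conv1d(x, w)
#     # paddle.allclose(ref, out, atol=1e-4) should hold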
class FFTConv1d(paddle.nn.Layer):
"""
Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
Args:
in_channels (int): number of _input channels.
out_channels (int): number of output channels.
kernel_size (int): kernel size of convolution.
stride (int): stride of convolution.
padding (int): padding to apply to the _input.
bias (bool): if True, use a bias term.
..note::
This module is faster than `paddle.nn.Conv1D` only in specific cases.
Typically, `kernel_size` should be of the order of 256 to see any real gain,
for a stride of 1.
..warning::
Dilation and groups are not supported at the moment. This module might use
more memory than the default Conv1D implementation.
>>> fftconv = FFTConv1d(12, 24, 128, 4)
>>> x = paddle.randn([4, 12, 1024])
>>> print(list(fftconv(x).shape))
[4, 24, 225]
"""
def __init__(
self,
in_channels: int,
out_channels: int,
kernel_size: int,
stride: int=1,
padding: int=0,
bias: bool=True, ):
super(FFTConv1d, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
# Create a Conv1D layer to initialize weights and bias
conv = paddle.nn.Conv1D(
in_channels,
out_channels,
kernel_size,
stride=stride,
padding=padding,
bias_attr=bias)
self.weight = conv.weight
if bias:
self.bias = conv.bias
else:
self.bias = None
def forward(self, _input: paddle.Tensor):
return fft_conv1d(_input, self.weight, self.bias, self.stride,
self.padding)
class LowPassFilters(nn.Layer):
"""
Bank of low pass filters.
"""
def __init__(self,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None,
dtype="float32"):
super(LowPassFilters, self).__init__()
self.cutoffs = list(cutoffs)
if min(self.cutoffs) < 0:
raise ValueError("Minimum cutoff must be larger than zero.")
if max(self.cutoffs) > 0.5:
raise ValueError("A cutoff above 0.5 does not make sense.")
self.stride = stride
self.pad = pad
self.zeros = zeros
self.half_size = int(zeros / min([c for c in self.cutoffs if c > 0]) /
2)
if fft is None:
fft = self.half_size > 32
self.fft = fft
# Create filters
window = paddle.audio.functional.get_window(
"hann", 2 * self.half_size + 1, fftbins=False, dtype=dtype)
time = paddle.arange(
-self.half_size, self.half_size + 1, dtype="float32")
filters = []
for cutoff in cutoffs:
if cutoff == 0:
filter_ = paddle.zeros_like(time)
else:
filter_ = 2 * cutoff * window * sinc(2 * cutoff * math.pi *
time)
# Normalize filter
filter_ /= paddle.sum(filter_)
filters.append(filter_)
filters = paddle.stack(filters)[:, None]
self.filters = self.create_parameter(
shape=filters.shape,
default_initializer=nn.initializer.Constant(value=0.0),
dtype="float32",
is_bias=False,
attr=paddle.ParamAttr(trainable=False), )
self.filters.set_value(filters)
def forward(self, _input):
shape = list(_input.shape)
_input = _input.reshape([-1, 1, shape[-1]])
if self.pad:
_input = F.pad(
_input, (self.half_size, self.half_size),
mode="replicate",
data_format="NCL")
if self.fft:
out = fft_conv1d(_input, self.filters, stride=self.stride)
else:
out = F.conv1d(_input, self.filters, stride=self.stride)
shape.insert(0, len(self.cutoffs))
shape[-1] = out.shape[-1]
return out.transpose([1, 0, 2]).reshape(shape)
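# A usage sketch for `LowPassFilters`; cutoffs are given as a fraction of the
# sample rate, and one output band is produced per cutoff (shapes illustrative):
#
#     bank = LowPassFilters([0.1, 0.25])
#     x = paddle.randn([4, 1024])
#     lows = bank(x)   # shape [2, 4, 1024]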
class LowPassFilter(nn.Layer):
"""
Same as `LowPassFilters` but applies a single low pass filter.
"""
def __init__(self,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
super(LowPassFilter, self).__init__()
self._lowpasses = LowPassFilters([cutoff], stride, pad, zeros, fft)
@property
def cutoff(self):
return self._lowpasses.cutoffs[0]
@property
def stride(self):
return self._lowpasses.stride
@property
def pad(self):
return self._lowpasses.pad
@property
def zeros(self):
return self._lowpasses.zeros
@property
def fft(self):
return self._lowpasses.fft
def forward(self, _input):
return self._lowpasses(_input)[0]
def lowpass_filters(
_input: paddle.Tensor,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
"""
Functional version of `LowPassFilters`, refer to this class for more information.
"""
return LowPassFilters(cutoffs, stride, pad, zeros, fft)(_input)
def lowpass_filter(_input: paddle.Tensor,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
"""
Same as `lowpass_filters` but with a single cutoff frequency.
Output will not have a dimension inserted in the front.
"""
return lowpass_filters(_input, [cutoff], stride, pad, zeros, fft)[0]
class HighPassFilters(paddle.nn.Layer):
"""
Bank of high pass filters. See `julius.lowpass.LowPassFilters` for more
details on the implementation.
Args:
cutoffs (list[float]): list of cutoff frequencies, in [0, 0.5] expressed as `f/f_s` where
f_s is the samplerate and `f` is the cutoff frequency.
The upper limit is 0.5, because a signal sampled at `f_s` contains only
frequencies under `f_s / 2`.
stride (int): how much to decimate the output. Probably not a good idea
to do so with a high pass filter though...
pad (bool): if True, appropriately pad the _input with zero over the edge. If `stride=1`,
the output will have the same length as the _input.
zeros (float): Number of zero crossings to keep.
Controls the receptive field of the Finite Impulse Response filter.
For filters with low cutoff frequency, e.g. 40Hz at 44.1kHz,
it is a bad idea to set this to a high value.
The default of 8 is likely appropriate for most uses. Lower values
will result in a faster filter, but with a slower attenuation around the
cutoff frequency.
fft (bool or None): if True, uses the FFT-based convolution from this module
(`fft_conv1d`) rather than a direct Paddle convolution. If False, uses a direct
Paddle convolution. If None, either one will be chosen automatically
depending on the effective filter size.
..warning::
All the filters will use the same filter size, aligned on the lowest
frequency provided. If you combine a lot of filters with very diverse frequencies, it might
be more efficient to split them over multiple modules with similar frequencies.
Shape:
- Input: `[*, T]`
- Output: `[F, *, T']`, with `T'=T` if `pad` is True and `stride` is 1, and
`F` is the number of cutoff frequencies.
>>> highpass = HighPassFilters([1/4])
>>> x = paddle.randn([4, 12, 21, 1024])
>>> list(highpass(x).shape)
[1, 4, 12, 21, 1024]
"""
def __init__(self,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
super().__init__()
self._lowpasses = LowPassFilters(cutoffs, stride, pad, zeros, fft)
@property
def cutoffs(self):
return self._lowpasses.cutoffs
@property
def stride(self):
return self._lowpasses.stride
@property
def pad(self):
return self._lowpasses.pad
@property
def zeros(self):
return self._lowpasses.zeros
@property
def fft(self):
return self._lowpasses.fft
def forward(self, _input):
lows = self._lowpasses(_input)
# We need to extract the right portion of the _input in case
# pad is False or stride > 1
if self.pad:
start, end = 0, _input.shape[-1]
else:
start = self._lowpasses.half_size
end = -start
_input = _input[..., start:end:self.stride]
highs = _input - lows
return highs
class HighPassFilter(paddle.nn.Layer):
"""
Same as `HighPassFilters` but applies a single high pass filter.
Shape:
- Input: `[*, T]`
- Output: `[*, T']`, with `T'=T` if `pad` is True and `stride` is 1.
>>> highpass = HighPassFilter(1/4, stride=1)
>>> x = paddle.randn([4, 124])
>>> list(highpass(x).shape)
[4, 124]
"""
def __init__(self,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
super().__init__()
self._highpasses = HighPassFilters([cutoff], stride, pad, zeros, fft)
@property
def cutoff(self):
return self._highpasses.cutoffs[0]
@property
def stride(self):
return self._highpasses.stride
@property
def pad(self):
return self._highpasses.pad
@property
def zeros(self):
return self._highpasses.zeros
@property
def fft(self):
return self._highpasses.fft
def forward(self, _input):
return self._highpasses(_input)[0]
def highpass_filters(
_input: paddle.Tensor,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
"""
Functional version of `HighPassFilters`, refer to this class for more information.
"""
return HighPassFilters(cutoffs, stride, pad, zeros, fft)(_input)
def highpass_filter(_input: paddle.Tensor,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
"""
Functional version of `HighPassFilter`, refer to this class for more information.
Output will not have a dimension inserted in the front.
"""
return highpass_filters(_input, [cutoff], stride, pad, zeros, fft)[0]
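# Because `HighPassFilters` computes the high band as `input - lows`, a low-pass and
# a high-pass at the same cutoff reconstruct the input when `pad=True` and `stride=1`.
# A small sketch of that identity (cutoff value illustrative):
#
#     x = paddle.randn([4, 1024])
#     recon = lowpass_filter(x, 0.2) + highpass_filter(x, 0.2)
#     # paddle.allclose(recon, x) should hold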
import paddle
from typing import Optional, Sequence
def hz_to_mel(freqs: paddle.Tensor):
"""
Converts a Tensor of frequencies in hertz to the mel scale.
Uses the simple formula by O'Shaughnessy (1987).
Args:
freqs (paddle.Tensor): frequencies to convert.
"""
return 2595 * paddle.log10(1 + freqs / 700)
def mel_to_hz(mels: paddle.Tensor):
"""
Converts a Tensor of mel scaled frequencies to Hertz.
Uses the simple formula by O'Shaughnessy (1987).
Args:
mels (paddle.Tensor): mel frequencies to convert.
"""
return 700 * (10**(mels / 2595) - 1)
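# `hz_to_mel` and `mel_to_hz` are inverses of each other; a quick round-trip sketch:
#
#     f = paddle.to_tensor([440.0, 1000.0])
#     back = mel_to_hz(hz_to_mel(f))
#     # paddle.allclose(back, f) should hold up to floating point error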
def mel_frequencies(n_mels: int, fmin: float, fmax: float):
"""
Return frequencies that are evenly spaced in mel scale.
Args:
n_mels (int): number of frequencies to return.
fmin (float): start from this frequency (in Hz).
fmax (float): finish at this frequency (in Hz).
"""
low = hz_to_mel(paddle.to_tensor(float(fmin))).item()
high = hz_to_mel(paddle.to_tensor(float(fmax))).item()
mels = paddle.linspace(low, high, n_mels)
return mel_to_hz(mels)
class SplitBands(paddle.nn.Layer):
"""
Decomposes a signal over the given frequency bands in the waveform domain using
a cascade of low pass filters as implemented by `julius.lowpass.LowPassFilters`.
You can either specify explicitly the frequency cutoffs, or just the number of bands,
in which case the frequency cutoffs will be spread out evenly in mel scale.
Args:
sample_rate (float): Sample rate of the input signal in Hz.
n_bands (int or None): number of bands, when not giving them explicitly with `cutoffs`.
In that case, the cutoff frequencies will be evenly spaced in mel-space.
cutoffs (list[float] or None): list of frequency cutoffs in Hz.
pad (bool): if True, appropriately pad the input with zero over the edge. If `stride=1`,
the output will have the same length as the input.
zeros (float): Number of zero crossings to keep. See `LowPassFilters` for more information.
fft (bool or None): See `LowPassFilters` for more info.
..note::
The sum of all the bands will always be the input signal.
..warning::
Unlike `julius.lowpass.LowPassFilters`, the cutoff frequencies must be provided in Hz along
with the sample rate.
Shape:
- Input: `[*, T]`
- Output: `[B, *, T']`, with `T'=T` if `pad` is True.
If `n_bands` was provided, `B = n_bands` otherwise `B = len(cutoffs) + 1`
>>> bands = SplitBands(sample_rate=128, n_bands=10)
>>> x = paddle.randn(shape=[6, 4, 1024])
>>> list(bands(x).shape)
[10, 6, 4, 1024]
"""
def __init__(
self,
sample_rate: float,
n_bands: Optional[int]=None,
cutoffs: Optional[Sequence[float]]=None,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
super(SplitBands, self).__init__()
if (cutoffs is None) + (n_bands is None) != 1:
raise ValueError(
"You must provide either n_bands, or cutoffs, but not both.")
self.sample_rate = sample_rate
self.n_bands = n_bands
self._cutoffs = list(cutoffs) if cutoffs is not None else None
self.pad = pad
self.zeros = zeros
self.fft = fft
if cutoffs is None:
if n_bands is None:
raise ValueError("You must provide one of n_bands or cutoffs.")
if not n_bands >= 1:
raise ValueError(
f"n_bands must be greater than one (got {n_bands})")
cutoffs = mel_frequencies(n_bands + 1, 0, sample_rate / 2)[1:-1]
else:
if max(cutoffs) > 0.5 * sample_rate:
raise ValueError(
"A cutoff above sample_rate/2 does not make sense.")
if len(cutoffs) > 0:
self.lowpass = LowPassFilters(
[c / sample_rate for c in cutoffs],
pad=pad,
zeros=zeros,
fft=fft)
else:
self.lowpass = None # type: ignore
def forward(self, input):
if self.lowpass is None:
return input[None]
lows = self.lowpass(input)
low = lows[0]
bands = [low]
for low_and_band in lows[1:]:
# Get a bandpass filter by subtracting lowpasses
band = low_and_band - low
bands.append(band)
low = low_and_band
# Last band is whatever is left in the signal
bands.append(input - low)
return paddle.stack(bands)
@property
def cutoffs(self):
if self._cutoffs is not None:
return self._cutoffs
elif self.lowpass is not None:
return [c * self.sample_rate for c in self.lowpass.cutoffs]
else:
return []
def split_bands(
signal: paddle.Tensor,
sample_rate: float,
n_bands: Optional[int]=None,
cutoffs: Optional[Sequence[float]]=None,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
"""
Functional version of `SplitBands`, refer to this class for more information.
>>> x = paddle.randn(shape=[6, 4, 1024])
>>> list(split_bands(x, sample_rate=64, cutoffs=[12, 24]).shape)
[3, 6, 4, 1024]
"""
return SplitBands(sample_rate, n_bands, cutoffs, pad, zeros, fft)(signal)

@@ -14,15 +14,17 @@ import librosa
import numpy as np
import paddle
import soundfile
from . import util
from .dsp import DSPMixin
from .effects import EffectMixin
from .effects import ImpulseResponseMixin
from .ffmpeg import FFMPEGMixin
from .loudness import LoudnessMixin
from .resample import resample_frac
# from .display import DisplayMixin
# from .dsp import DSPMixin
# from .effects import EffectMixin
# from .effects import ImpulseResponseMixin
# from .ffmpeg import FFMPEGMixinx
# from loudness import LoudnessMixin
# from .playback import PlayMixin
# from .whisper import WhisperMixin
@@ -89,13 +91,13 @@ STFTParams.__new__.__defaults__ = (None, None, None, None, None)
class AudioSignal(
# EffectMixin,
# LoudnessMixin,
EffectMixin,
LoudnessMixin,
# PlayMixin,
# ImpulseResponseMixin,
# DSPMixin,
ImpulseResponseMixin,
DSPMixin,
# DisplayMixin,
# FFMPEGMixin,
FFMPEGMixin,
# WhisperMixin,
):
"""This is the core object of this library. Audio is always
@@ -525,7 +527,7 @@ class AudioSignal(
AudioSignal
AudioSignal loaded from file
"""
# need `ffmpeg`
data, sample_rate = librosa.load(
audio_path,
offset=offset,
@@ -967,8 +969,7 @@ class AudioSignal(
def stft_data(self, data: typing.Union[paddle.Tensor, np.ndarray]):
if data is not None:
assert paddle.is_tensor(data) and paddle.is_complex(data)
if (self.stft_data is not None and
self.stft_data.shape != data.shape):
if self.stft_data is not None and self.stft_data.shape != data.shape:
warnings.warn("stft_data changed shape")
self._stft_data = data
return
@@ -1139,8 +1140,7 @@ class AudioSignal(
length = self.signal_length
if match_stride:
assert (hop_length == window_length //
4), "For match_stride, hop must equal n_fft // 4"
assert hop_length == window_length // 4, "For match_stride, hop must equal n_fft // 4"
right_pad = math.ceil(length / hop_length) * hop_length - length
pad = (window_length - hop_length) // 2
else:
@@ -1192,16 +1192,13 @@ class AudioSignal(
>>> signal.stft()
"""
window_length = (self.stft_params.window_length
if window_length is None else int(window_length))
hop_length = (self.stft_params.hop_length
if hop_length is None else int(hop_length))
window_type = (self.stft_params.window_type
if window_type is None else window_type)
match_stride = (self.stft_params.match_stride
if match_stride is None else match_stride)
padding_type = (self.stft_params.padding_type
if padding_type is None else padding_type)
window_length = self.stft_params.window_length if window_length is None else int(
window_length)
hop_length = self.stft_params.hop_length if hop_length is None else int(
hop_length)
window_type = self.stft_params.window_type if window_type is None else window_type
match_stride = self.stft_params.match_stride if match_stride is None else match_stride
padding_type = self.stft_params.padding_type if padding_type is None else padding_type
window = self.get_window(window_type, window_length)
# window = window.to(self.audio_data.device)
@@ -1269,14 +1266,12 @@ class AudioSignal(
if self.stft_data is None:
raise RuntimeError("Cannot do inverse STFT without self.stft_data!")
window_length = (self.stft_params.window_length
if window_length is None else int(window_length))
hop_length = (self.stft_params.hop_length
if hop_length is None else int(hop_length))
window_type = (self.stft_params.window_type
if window_type is None else window_type)
match_stride = (self.stft_params.match_stride
if match_stride is None else match_stride)
window_length = self.stft_params.window_length if window_length is None else int(
window_length)
hop_length = self.stft_params.hop_length if hop_length is None else int(
hop_length)
window_type = self.stft_params.window_type if window_type is None else window_type
match_stride = self.stft_params.match_stride if match_stride is None else match_stride
window = self.get_window(window_type, window_length,
self.stft_data.place)
@@ -1409,7 +1404,6 @@ class AudioSignal(
paddle.Tensor [shape=(n_mels, n_mfcc)] T
The dct transformation matrix.
"""
# from torchaudio.functional import create_dct
return create_dct(n_mfcc, n_mels, norm)
@@ -1575,8 +1569,7 @@ class AudioSignal(
# Representation
def _info(self):
# ✅
dur = (f"{self.signal_duration:0.3f}"
if self.signal_duration else "[unknown]")
dur = f"{self.signal_duration:0.3f}" if self.signal_duration else "[unknown]"
info = {
"duration":
f"{dur} seconds",
@@ -1654,6 +1647,16 @@ class AudioSignal(
def __eq__(self, other):
for k, v in list(self.__dict__.items()):
if paddle.is_tensor(v):
if paddle.is_complex(v):
if not np.allclose(
v.cpu().numpy(),
other.__dict__[k].cpu().numpy(),
atol=1e-6):
max_error = (v - other.__dict__[k]).abs().max()
print(f"Max abs error for {k}: {max_error}")
return False
else:
if not paddle.allclose(v, other.__dict__[k], atol=1e-6):
max_error = (v - other.__dict__[k]).abs().max()
print(f"Max abs error for {k}: {max_error}")
@@ -1675,10 +1678,10 @@ class AudioSignal(
# Future work: make this work for time-indexing
# as well, using the hop length.
audio_data = self.audio_data[key]
_loudness = (self._loudness[key]
if self._loudness is not None else None)
stft_data = (self.stft_data[key]
if self.stft_data is not None else None)
_loudness = self._loudness[
key] if self._loudness is not None else None
stft_data = self.stft_data[
key] if self.stft_data is not None else None
sources = None
@@ -1707,6 +1710,11 @@ class AudioSignal(
if self.audio_data is not None and value.audio_data is not None:
self.audio_data[key] = value.audio_data
if self._loudness is not None and value._loudness is not None:
if paddle.is_tensor(key) and key.dtype == paddle.bool:
# Paddle does not support assignment with a boolean index here, so convert the mask to integer indices.
_key_no_bool = paddle.nonzero(key).flatten()
self._loudness[_key_no_bool] = value._loudness
else:
self._loudness[key] = value._loudness
if self.stft_data is not None and value.stft_data is not None:
self.stft_data[key] = value.stft_data

@@ -0,0 +1,390 @@
import typing
import numpy as np
import paddle
from . import _julius
from . import util
class DSPMixin:
_original_batch_size = None
_original_num_channels = None
_padded_signal_length = None
# def _preprocess_signal_for_windowing(self, window_duration, hop_duration):
# self._original_batch_size = self.batch_size
# self._original_num_channels = self.num_channels
# window_length = int(window_duration * self.sample_rate)
# hop_length = int(hop_duration * self.sample_rate)
# if window_length % hop_length != 0:
# factor = window_length // hop_length
# window_length = factor * hop_length
# self.zero_pad(hop_length, hop_length)
# self._padded_signal_length = self.signal_length
# return window_length, hop_length
# def windows(
# self, window_duration: float, hop_duration: float, preprocess: bool = True
# ):
# """Generator which yields windows of specified duration from signal with a specified
# hop length.
# Parameters
# ----------
# window_duration : float
# Duration of every window in seconds.
# hop_duration : float
# Hop between windows in seconds.
# preprocess : bool, optional
# Whether to preprocess the signal, so that the first sample is in
# the middle of the first window, by default True
# Yields
# ------
# AudioSignal
# Each window is returned as an AudioSignal.
# """
# if preprocess:
# window_length, hop_length = self._preprocess_signal_for_windowing(
# window_duration, hop_duration
# )
# self.audio_data = self.audio_data.reshape(-1, 1, self.signal_length)
# for b in range(self.batch_size):
# i = 0
# start_idx = i * hop_length
# while True:
# start_idx = i * hop_length
# i += 1
# end_idx = start_idx + window_length
# if end_idx > self.signal_length:
# break
# yield self[b, ..., start_idx:end_idx]
# def collect_windows(
# self, window_duration: float, hop_duration: float, preprocess: bool = True
# ):
# """Reshapes signal into windows of specified duration from signal with a specified
# hop length. Window are placed along the batch dimension. Use with
# :py:func:`audiotools.core.dsp.DSPMixin.overlap_and_add` to reconstruct the
# original signal.
# Parameters
# ----------
# window_duration : float
# Duration of every window in seconds.
# hop_duration : float
# Hop between windows in seconds.
# preprocess : bool, optional
# Whether to preprocess the signal, so that the first sample is in
# the middle of the first window, by default True
# Returns
# -------
# AudioSignal
# AudioSignal unfolded with shape ``(nb * nch * num_windows, 1, window_length)``
# """
# if preprocess:
# window_length, hop_length = self._preprocess_signal_for_windowing(
# window_duration, hop_duration
# )
# # self.audio_data: (nb, nch, nt).
# unfolded = paddle.nn.functional.unfold(
# self.audio_data.reshape(-1, 1, 1, self.signal_length),
# kernel_size=(1, window_length),
# stride=(1, hop_length),
# )
# # unfolded: (nb * nch, window_length, num_windows).
# # -> (nb * nch * num_windows, 1, window_length)
# unfolded = unfolded.permute(0, 2, 1).reshape(-1, 1, window_length)
# self.audio_data = unfolded
# return self
# def overlap_and_add(self, hop_duration: float):
# """Function which takes a list of windows and overlap adds them into a
# signal the same length as ``audio_signal``.
# Parameters
# ----------
# hop_duration : float
# How much to shift for each window
# (overlap is window_duration - hop_duration) in seconds.
# Returns
# -------
# AudioSignal
# overlap-and-added signal.
# """
# hop_length = int(hop_duration * self.sample_rate)
# window_length = self.signal_length
# nb, nch = self._original_batch_size, self._original_num_channels
# unfolded = self.audio_data.reshape(nb * nch, -1, window_length).permute(0, 2, 1)
# folded = paddle.nn.functional.fold(
# unfolded,
# output_size=(1, self._padded_signal_length),
# kernel_size=(1, window_length),
# stride=(1, hop_length),
# )
# norm = paddle.ones_like(unfolded, device=unfolded.device)
# norm = paddle.nn.functional.fold(
# norm,
# output_size=(1, self._padded_signal_length),
# kernel_size=(1, window_length),
# stride=(1, hop_length),
# )
# folded = folded / norm
# folded = folded.reshape(nb, nch, -1)
# self.audio_data = folded
# self.trim(hop_length, hop_length)
# return self
def low_pass(self,
cutoffs: typing.Union[paddle.Tensor, np.ndarray, float],
zeros: int=51):
"""Low-passes the signal in-place. Each item in the batch
can have a different low-pass cutoff, if the input
to this signal is an array or tensor. If a float, all
items are given the same low-pass filter.
Parameters
----------
cutoffs : typing.Union[paddle.Tensor, np.ndarray, float]
Cutoff in Hz of low-pass filter.
zeros : int, optional
Number of taps to use in low-pass filter, by default 51
Returns
-------
AudioSignal
Low-passed AudioSignal.
"""
cutoffs = util.ensure_tensor(cutoffs, 2, self.batch_size)
cutoffs = cutoffs / self.sample_rate
filtered = paddle.empty_like(self.audio_data)
for i, cutoff in enumerate(cutoffs):
lp_filter = _julius.LowPassFilter(cutoff.cpu(), zeros=zeros)
filtered[i] = lp_filter(self.audio_data[i])
self.audio_data = filtered
self.stft_data = None
return self
def high_pass(self,
cutoffs: typing.Union[paddle.Tensor, np.ndarray, float],
zeros: int=51):
"""High-passes the signal in-place. Each item in the batch
can have a different high-pass cutoff, if the input
to this signal is an array or tensor. If a float, all
items are given the same high-pass filter.
Parameters
----------
cutoffs : typing.Union[paddle.Tensor, np.ndarray, float]
Cutoff in Hz of high-pass filter.
zeros : int, optional
Number of taps to use in high-pass filter, by default 51
Returns
-------
AudioSignal
High-passed AudioSignal.
"""
cutoffs = util.ensure_tensor(cutoffs, 2, self.batch_size)
cutoffs = cutoffs / self.sample_rate
filtered = paddle.empty_like(self.audio_data)
for i, cutoff in enumerate(cutoffs):
hp_filter = _julius.HighPassFilter(cutoff.cpu(), zeros=zeros)
filtered[i] = hp_filter(self.audio_data[i])
self.audio_data = filtered
self.stft_data = None
return self
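# A usage sketch for the two filters above, assuming an `AudioSignal` built from a
# `[batch, channels, time]` tensor as done elsewhere in this diff. Cutoffs are in Hz
# and may be a float (shared across the batch) or a tensor with one value per item:
#
#     signal = AudioSignal(paddle.randn([2, 1, 44100]), 44100)
#     signal.low_pass(4000.0)                              # same cutoff for all items
#     signal.high_pass(paddle.to_tensor([100.0, 200.0]))   # per-item cutoffs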
# def mask_frequencies(
# self,
# fmin_hz: typing.Union[paddle.Tensor, np.ndarray, float],
# fmax_hz: typing.Union[paddle.Tensor, np.ndarray, float],
# val: float = 0.0,
# ):
# """Masks frequencies between ``fmin_hz`` and ``fmax_hz``, and fills them
# with the value specified by ``val``. Useful for implementing SpecAug.
# The min and max can be different for every item in the batch.
# Parameters
# ----------
# fmin_hz : typing.Union[paddle.Tensor, np.ndarray, float]
# Lower end of band to mask out.
# fmax_hz : typing.Union[paddle.Tensor, np.ndarray, float]
# Upper end of band to mask out.
# val : float, optional
# Value to fill in, by default 0.0
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# # SpecAug
# mag, phase = self.magnitude, self.phase
# fmin_hz = util.ensure_tensor(fmin_hz, ndim=mag.ndim)
# fmax_hz = util.ensure_tensor(fmax_hz, ndim=mag.ndim)
# assert paddle.all(fmin_hz < fmax_hz)
# # build mask
# nbins = mag.shape[-2]
# bins_hz = paddle.linspace(0, self.sample_rate / 2, nbins, device=self.device)
# bins_hz = bins_hz[None, None, :, None].repeat(
# self.batch_size, 1, 1, mag.shape[-1]
# )
# mask = (fmin_hz <= bins_hz) & (bins_hz < fmax_hz)
# mask = mask.to(self.device)
# mag = mag.masked_fill(mask, val)
# phase = phase.masked_fill(mask, val)
# self.stft_data = mag * paddle.exp(1j * phase)
# return self
# def mask_timesteps(
# self,
# tmin_s: typing.Union[paddle.Tensor, np.ndarray, float],
# tmax_s: typing.Union[paddle.Tensor, np.ndarray, float],
# val: float = 0.0,
# ):
# """Masks timesteps between ``tmin_s`` and ``tmax_s``, and fills them
# with the value specified by ``val``. Useful for implementing SpecAug.
# The min and max can be different for every item in the batch.
# Parameters
# ----------
# tmin_s : typing.Union[paddle.Tensor, np.ndarray, float]
# Lower end of timesteps to mask out.
# tmax_s : typing.Union[paddle.Tensor, np.ndarray, float]
# Upper end of timesteps to mask out.
# val : float, optional
# Value to fill in, by default 0.0
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# # SpecAug
# mag, phase = self.magnitude, self.phase
# tmin_s = util.ensure_tensor(tmin_s, ndim=mag.ndim)
# tmax_s = util.ensure_tensor(tmax_s, ndim=mag.ndim)
# assert paddle.all(tmin_s < tmax_s)
# # build mask
# nt = mag.shape[-1]
# bins_t = paddle.linspace(0, self.signal_duration, nt, device=self.device)
# bins_t = bins_t[None, None, None, :].repeat(
# self.batch_size, 1, mag.shape[-2], 1
# )
# mask = (tmin_s <= bins_t) & (bins_t < tmax_s)
# mag = mag.masked_fill(mask, val)
# phase = phase.masked_fill(mask, val)
# self.stft_data = mag * paddle.exp(1j * phase)
# return self
# def mask_low_magnitudes(
# self, db_cutoff: typing.Union[paddle.Tensor, np.ndarray, float], val: float = 0.0
# ):
# """Mask away magnitudes below a specified threshold, which
# can be different for every item in the batch.
# Parameters
# ----------
# db_cutoff : typing.Union[paddle.Tensor, np.ndarray, float]
# Decibel value for which things below it will be masked away.
# val : float, optional
# Value to fill in for masked portions, by default 0.0
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# mag = self.magnitude
# log_mag = self.log_magnitude()
# db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
# mask = log_mag < db_cutoff
# mag = mag.masked_fill(mask, val)
# self.magnitude = mag
# return self
# def shift_phase(self, shift: typing.Union[paddle.Tensor, np.ndarray, float]):
# """Shifts the phase by a constant value.
# Parameters
# ----------
# shift : typing.Union[paddle.Tensor, np.ndarray, float]
# What to shift the phase by.
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
# self.phase = self.phase + shift
# return self
# def corrupt_phase(self, scale: typing.Union[paddle.Tensor, np.ndarray, float]):
# """Corrupts the phase randomly by some scaled value.
# Parameters
# ----------
# scale : typing.Union[paddle.Tensor, np.ndarray, float]
# Standard deviation of noise to add to the phase.
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# scale = util.ensure_tensor(scale, ndim=self.phase.ndim)
# self.phase = self.phase + scale * paddle.randn_like(self.phase)
# return self
# def preemphasis(self, coef: float = 0.85):
# """Applies pre-emphasis to audio signal.
# Parameters
# ----------
# coef : float, optional
# How much pre-emphasis to apply, lower values do less. 0 does nothing.
# by default 0.85
# Returns
# -------
# AudioSignal
# Pre-emphasized signal.
# """
# kernel = paddle.to_tensor([1, -coef, 0]).view(1, 1, -1).to(self.device)
# x = self.audio_data.reshape(-1, 1, self.signal_length)
# x = paddle.nn.functional.conv1d(x, kernel, padding=1)
# self.audio_data = x.reshape(*self.audio_data.shape)
# return self

@@ -0,0 +1,681 @@
import typing
import numpy as np
import paddle
from . import util
from ._julius import SplitBands
# from . import _julius
class EffectMixin:
GAIN_FACTOR = np.log(10) / 20
"""Gain factor for converting between amplitude and decibels."""
CODEC_PRESETS = {
"8-bit": {
"format": "wav",
"encoding": "ULAW",
"bits_per_sample": 8
},
"GSM-FR": {
"format": "gsm"
},
"MP3": {
"format": "mp3",
"compression": -9
},
"Vorbis": {
"format": "vorbis",
"compression": -1
},
"Ogg": {
"format": "ogg",
"compression": -1,
},
"Amr-nb": {
"format": "amr-nb"
},
}
"""Presets for applying codecs via torchaudio."""
def mix(
self,
other,
snr: typing.Union[paddle.Tensor, np.ndarray, float]=10,
other_eq: typing.Union[paddle.Tensor, np.ndarray]=None, ):
"""Mixes noise with signal at specified
signal-to-noise ratio. Optionally, the
other signal can be equalized in-place.
Parameters
----------
other : AudioSignal
AudioSignal object to mix with.
snr : typing.Union[paddle.Tensor, np.ndarray, float], optional
Signal to noise ratio, by default 10
other_eq : typing.Union[paddle.Tensor, np.ndarray], optional
EQ curve to apply to other signal, if any, by default None
Returns
-------
AudioSignal
In-place modification of AudioSignal.
"""
snr = util.ensure_tensor(snr)
pad_len = max(0, self.signal_length - other.signal_length)
other.zero_pad(0, pad_len)
other.truncate_samples(self.signal_length)
if other_eq is not None:
other = other.equalizer(other_eq)
tgt_loudness = self.loudness() - snr
other = other.normalize(tgt_loudness)
self.audio_data = self.audio_data + other.audio_data
return self
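# A usage sketch for `mix`, assuming two `AudioSignal`s built as elsewhere in this
# diff; the second signal is loudness-normalized so the result sits at the requested
# SNR relative to the first, and the first signal is modified in place:
#
#     clean = AudioSignal(paddle.randn([1, 1, 44100]), 44100)
#     noise = AudioSignal(paddle.randn([1, 1, 44100]), 44100)
#     noisy = clean.mix(noise, snr=10)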
def convolve(self, other, start_at_max: bool=True):
"""Convolves self with other.
This function uses FFTs to do the convolution.
Parameters
----------
other : AudioSignal
Signal to convolve with.
start_at_max : bool, optional
Whether to start at the max value of other signal, to
avoid inducing delays, by default True
Returns
-------
AudioSignal
Convolved signal, in-place.
"""
from . import AudioSignal
pad_len = self.signal_length - other.signal_length
if pad_len > 0:
other.zero_pad(0, pad_len)
else:
other.truncate_samples(self.signal_length)
if start_at_max:
# Use roll to rotate over the max for every item
# so that the impulse responses don't induce any
# delay.
idx = paddle.argmax(paddle.abs(other.audio_data), axis=-1)
irs = paddle.zeros_like(other.audio_data)
for i in range(other.batch_size):
irs[i] = paddle.roll(
other.audio_data[i], shifts=-idx[i].item(), axis=-1)
other = AudioSignal(irs, other.sample_rate)
delta = paddle.zeros_like(other.audio_data)
delta[..., 0] = 1
length = self.signal_length
delta_fft = paddle.fft.rfft(delta, n=length)
other_fft = paddle.fft.rfft(other.audio_data, n=length)
self_fft = paddle.fft.rfft(self.audio_data, n=length)
convolved_fft = other_fft * self_fft
convolved_audio = paddle.fft.irfft(convolved_fft, n=length)
delta_convolved_fft = other_fft * delta_fft
delta_audio = paddle.fft.irfft(delta_convolved_fft, n=length)
# Use the delta to rescale the audio exactly as needed.
delta_max = paddle.max(paddle.abs(delta_audio), axis=-1, keepdim=True)
scale = 1 / paddle.clip(delta_max, min=1e-5)
convolved_audio = convolved_audio * scale
self.audio_data = convolved_audio
return self
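# A quick sanity sketch for `convolve`: convolving with a unit impulse should return
# the signal essentially unchanged (the `AudioSignal` construction mirrors its use
# elsewhere in this diff; shapes are illustrative):
#
#     sig = AudioSignal(paddle.randn([1, 1, 8000]), 8000)
#     impulse = paddle.zeros([1, 1, 8000])
#     impulse[..., 0] = 1.0
#     out = sig.convolve(AudioSignal(impulse, 8000))   # ~identical to the input audio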
def apply_ir(
self,
ir,
drr: typing.Union[paddle.Tensor, np.ndarray, float]=None,
ir_eq: typing.Union[paddle.Tensor, np.ndarray]=None,
use_original_phase: bool=False, ):
"""Applies an impulse response to the signal. If ` is`ir_eq``
is specified, the impulse response is equalized before
it is applied, using the given curve.
Parameters
----------
ir : AudioSignal
Impulse response to convolve with.
drr : typing.Union[paddle.Tensor, np.ndarray, float], optional
Direct-to-reverberant ratio that impulse response will be
altered to, if specified, by default None
ir_eq : typing.Union[paddle.Tensor, np.ndarray], optional
Equalization that will be applied to impulse response
if specified, by default None
use_original_phase : bool, optional
Whether to use the original phase, instead of the convolved
phase, by default False
Returns
-------
AudioSignal
Signal with impulse response applied to it
"""
if ir_eq is not None:
ir = ir.equalizer(ir_eq)
if drr is not None:
ir = ir.alter_drr(drr)
# Save the peak before
max_spk = self.audio_data.abs().max(axis=-1, keepdim=True)
# Augment the impulse response to simulate microphone effects
# and with varying direct-to-reverberant ratio.
phase = self.phase
self.convolve(ir)
# Use the input phase
if use_original_phase:
self.stft()
self.stft_data = self.magnitude * paddle.exp(1j * phase)
self.istft()
# Rescale to the input's amplitude
max_transformed = self.audio_data.abs().max(axis=-1, keepdim=True)
scale_factor = max_spk.clip(1e-8) / max_transformed.clip(1e-8)
self = self * scale_factor
return self
def ensure_max_of_audio(self, _max: float=1.0):
"""Ensures that ``abs(audio_data) <= max``.
Parameters
----------
max : float, optional
Max absolute value of signal, by default 1.0
Returns
-------
AudioSignal
Signal with values scaled between -max and max.
"""
peak = self.audio_data.abs().max(axis=-1, keepdim=True)
peak_gain = paddle.ones_like(peak)
peak_gain[peak > _max] = _max / peak[peak > _max]
self.audio_data = self.audio_data * peak_gain
return self
def normalize(self,
db: typing.Union[paddle.Tensor, np.ndarray, float]=-24.0):
"""Normalizes the signal's volume to the specified db, in LUFS.
This is GPU-compatible, making for very fast loudness normalization.
Parameters
----------
db : typing.Union[paddle.Tensor, np.ndarray, float], optional
Loudness to normalize to, by default -24.0
Returns
-------
AudioSignal
Normalized audio signal.
"""
db = util.ensure_tensor(db)
ref_db = self.loudness()
gain = db - ref_db
gain = paddle.exp(gain * self.GAIN_FACTOR)
self.audio_data = self.audio_data * gain[:, None, None]
return self
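# A usage sketch for `normalize`, assuming an `AudioSignal` built as elsewhere in
# this diff; after the call, `sig.loudness()` should be close to the target LUFS:
#
#     sig = AudioSignal(paddle.randn([2, 1, 44100]), 44100)
#     sig.normalize(-24.0)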
# def volume_change(self, db: typing.Union[paddle.Tensor, np.ndarray, float]):
# """Change volume of signal by some amount, in dB.
# Parameters
# ----------
# db : typing.Union[paddle.Tensor, np.ndarray, float]
# Amount to change volume by.
# Returns
# -------
# AudioSignal
# Signal at new volume.
# """
# db = util.ensure_tensor(db, ndim=1).to(self.device)
# gain = torch.exp(db * self.GAIN_FACTOR)
# self.audio_data = self.audio_data * gain[:, None, None]
# return self
# def _to_2d(self):
# waveform = self.audio_data.reshape(-1, self.signal_length)
# return waveform
# def _to_3d(self, waveform):
# return waveform.reshape(self.batch_size, self.num_channels, -1)
# def pitch_shift(self, n_semitones: int, quick: bool = True):
# """Pitch shift the signal. All items in the batch
# get the same pitch shift.
# Parameters
# ----------
# n_semitones : int
# How many semitones to shift the signal by.
# quick : bool, optional
# Using quick pitch shifting, by default True
# Returns
# -------
# AudioSignal
# Pitch shifted audio signal.
# """
# device = self.device
# effects = [
# ["pitch", str(n_semitones * 100)],
# ["rate", str(self.sample_rate)],
# ]
# if quick:
# effects[0].insert(1, "-q")
# waveform = self._to_2d().cpu()
# waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
# waveform, self.sample_rate, effects, channels_first=True
# )
# self.sample_rate = sample_rate
# self.audio_data = self._to_3d(waveform)
# return self.to(device)
# def time_stretch(self, factor: float, quick: bool = True):
# """Time stretch the audio signal.
# Parameters
# ----------
# factor : float
# Factor by which to stretch the AudioSignal. Typically
# between 0.8 and 1.2.
# quick : bool, optional
# Whether to use quick time stretching, by default True
# Returns
# -------
# AudioSignal
# Time-stretched AudioSignal.
# """
# device = self.device
# effects = [
# ["tempo", str(factor)],
# ["rate", str(self.sample_rate)],
# ]
# if quick:
# effects[0].insert(1, "-q")
# waveform = self._to_2d().cpu()
# waveform, sample_rate = torchaudio.sox_effects.apply_effects_tensor(
# waveform, self.sample_rate, effects, channels_first=True
# )
# self.sample_rate = sample_rate
# self.audio_data = self._to_3d(waveform)
# return self.to(device)
# def apply_codec(
# self,
# preset: str = None,
# format: str = "wav",
# encoding: str = None,
# bits_per_sample: int = None,
# compression: int = None,
# ): # pragma: no cover
# """Applies an audio codec to the signal.
# Parameters
# ----------
# preset : str, optional
# One of the keys in ``self.CODEC_PRESETS``, by default None
# format : str, optional
# Format for audio codec, by default "wav"
# encoding : str, optional
# Encoding to use, by default None
# bits_per_sample : int, optional
# How many bits per sample, by default None
# compression : int, optional
# Compression amount of codec, by default None
# Returns
# -------
# AudioSignal
# AudioSignal with codec applied.
# Raises
# ------
# ValueError
# If preset is not in ``self.CODEC_PRESETS``, an error
# is thrown.
# """
# torchaudio_version_070 = "0.7" in torchaudio.__version__
# if torchaudio_version_070:
# return self
# kwargs = {
# "format": format,
# "encoding": encoding,
# "bits_per_sample": bits_per_sample,
# "compression": compression,
# }
# if preset is not None:
# if preset in self.CODEC_PRESETS:
# kwargs = self.CODEC_PRESETS[preset]
# else:
# raise ValueError(
# f"Unknown preset: {preset}. "
# f"Known presets: {list(self.CODEC_PRESETS.keys())}"
# )
# waveform = self._to_2d()
# if kwargs["format"] in ["vorbis", "mp3", "ogg", "amr-nb"]:
# # Apply it in a for loop
# augmented = torch.cat(
# [
# torchaudio.functional.apply_codec(
# waveform[i][None, :], self.sample_rate, **kwargs
# )
# for i in range(waveform.shape[0])
# ],
# dim=0,
# )
# else:
# augmented = torchaudio.functional.apply_codec(
# waveform, self.sample_rate, **kwargs
# )
# augmented = self._to_3d(augmented)
# self.audio_data = augmented
# return self
def mel_filterbank(self, n_bands: int):
"""Breaks signal into mel bands.
Parameters
----------
n_bands : int
Number of mel bands to use.
Returns
-------
paddle.Tensor
Mel-filtered bands, with last axis being the band index.
"""
filterbank = SplitBands(self.sample_rate, n_bands).float()
filtered = filterbank(self.audio_data)
return filtered.transpose([1, 2, 3, 0])
def equalizer(self, db: typing.Union[paddle.Tensor, np.ndarray]):
"""Applies a mel-spaced equalizer to the audio signal.
Parameters
----------
db : typing.Union[paddle.Tensor, np.ndarray]
EQ curve to apply.
Returns
-------
AudioSignal
AudioSignal with equalization applied.
"""
db = util.ensure_tensor(db)
n_bands = db.shape[-1]
fbank = self.mel_filterbank(n_bands)
# If there's a batch dimension, make sure it's the same.
if db.ndim == 2:
if db.shape[0] != 1:
assert db.shape[0] == fbank.shape[0]
else:
db = db.unsqueeze(0)
weights = (10**db).astype("float32")
fbank = fbank * weights[:, None, None, :]
eq_audio_data = fbank.sum(-1)
self.audio_data = eq_audio_data
return self
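# A usage sketch for `equalizer`: `db` holds one log10 gain per mel band, so an
# all-zero curve is (up to filterbank reconstruction) an identity EQ. The band
# count here is illustrative:
#
#     sig = AudioSignal(paddle.randn([1, 1, 44100]), 44100)
#     flat = paddle.zeros([6])   # 10**0 == 1, i.e. unit gain in every band
#     sig.equalizer(flat)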
def clip_distortion(
self,
clip_percentile: typing.Union[paddle.Tensor, np.ndarray, float]):
"""Clips the signal at a given percentile. The higher it is,
the lower the threshold for clipping.
Parameters
----------
clip_percentile : typing.Union[paddle.Tensor, np.ndarray, float]
Values are between 0.0 to 1.0. Typical values are 0.1 or below.
Returns
-------
AudioSignal
Audio signal with clipped audio data.
"""
clip_percentile = util.ensure_tensor(clip_percentile, ndim=1)
clip_percentile = clip_percentile.item()
min_thresh = paddle.quantile(
self.audio_data, clip_percentile / 2, axis=-1)[None]
max_thresh = paddle.quantile(
self.audio_data, 1 - (clip_percentile / 2), axis=-1)[None]
nc = self.audio_data.shape[1]
min_thresh = min_thresh[:, :nc, :]
max_thresh = max_thresh[:, :nc, :]
self.audio_data = self.audio_data.clip(min_thresh, max_thresh)
return self
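# A usage sketch for `clip_distortion`: the percentile is split between the two tails,
# so 0.1 clips roughly the top and bottom 5% of sample values (value illustrative):
#
#     sig = AudioSignal(paddle.randn([1, 1, 44100]), 44100)
#     sig.clip_distortion(0.1)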
# def quantization(
# self, quantization_channels: typing.Union[paddle.Tensor, np.ndarray, int]
# ):
# """Applies quantization to the input waveform.
# Parameters
# ----------
# quantization_channels : typing.Union[paddle.Tensor, np.ndarray, int]
# Number of evenly spaced quantization channels to quantize
# to.
# Returns
# -------
# AudioSignal
# Quantized AudioSignal.
# """
# quantization_channels = util.ensure_tensor(quantization_channels, ndim=3)
# x = self.audio_data
# x = (x + 1) / 2
# x = x * quantization_channels
# x = x.floor()
# x = x / quantization_channels
# x = 2 * x - 1
# residual = (self.audio_data - x).detach()
# self.audio_data = self.audio_data - residual
# return self
# def mulaw_quantization(
# self, quantization_channels: typing.Union[paddle.Tensor, np.ndarray, int]
# ):
# """Applies mu-law quantization to the input waveform.
# Parameters
# ----------
# quantization_channels : typing.Union[paddle.Tensor, np.ndarray, int]
# Number of mu-law spaced quantization channels to quantize
# to.
# Returns
# -------
# AudioSignal
# Quantized AudioSignal.
# """
# mu = quantization_channels - 1.0
# mu = util.ensure_tensor(mu, ndim=3)
# x = self.audio_data
# # quantize
# x = torch.sign(x) * torch.log1p(mu * torch.abs(x)) / torch.log1p(mu)
# x = ((x + 1) / 2 * mu + 0.5).to(torch.int64)
# # unquantize
# x = (x / mu) * 2 - 1.0
# x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.0) / mu
# residual = (self.audio_data - x).detach()
# self.audio_data = self.audio_data - residual
# return self
# def __matmul__(self, other):
# return self.convolve(other)
import paddle
import typing
import numpy as np
class ImpulseResponseMixin:
"""These functions are generally only used with AudioSignals that are derived
from impulse responses, not other sources like music or speech. These methods
are used to replicate the data augmentation described in [1].
1. Bryan, Nicholas J. "Impulse response data augmentation and deep
neural networks for blind room acoustic parameter estimation."
ICASSP 2020-2020 IEEE International Conference on Acoustics,
Speech and Signal Processing (ICASSP). IEEE, 2020.
"""
def decompose_ir(self):
"""Decomposes an impulse response into early and late
field responses.
"""
# Equations 1 and 2
# -----------------
# Breaking up into early
# response + late field response.
td = paddle.argmax(self.audio_data, axis=-1, keepdim=True)
t0 = int(self.sample_rate * 0.0025)
idx = paddle.arange(self.audio_data.shape[-1])[None, None, :]
idx = idx.expand([self.batch_size, -1, -1])
early_idx = (idx >= td - t0) * (idx <= td + t0)
early_response = paddle.zeros_like(self.audio_data)
# early_response[early_idx] = self.audio_data[early_idx]
early_response = paddle.where(early_idx, self.audio_data,
early_response)
late_idx = ~early_idx
late_field = paddle.zeros_like(self.audio_data)
# late_field[late_idx] = self.audio_data[late_idx]
late_field = paddle.where(late_idx, self.audio_data, late_field)
# Equation 4
# ----------
# Decompose early response into windowed
# direct path and windowed residual.
window = paddle.zeros_like(self.audio_data)
for idx in range(self.batch_size):
window_idx = early_idx[idx, 0]
# ----- Workaround: emulate the boolean-mask assignment shown below with paddle.scatter -----
# window[idx, ..., window_idx] = self.get_window("hann", window_idx.sum().item())
indices = paddle.nonzero(window_idx).reshape(
[-1]) # shape: [num_true], dtype: int64
temp_window = self.get_window("hann", indices.shape[0])
window_slice = window[idx, 0]
updated_window_slice = paddle.scatter(
window_slice, index=indices, updates=temp_window)
window[idx, 0] = updated_window_slice
# ----- End of workaround -----
return early_response, late_field, window
def measure_drr(self):
"""Measures the direct-to-reverberant ratio of the impulse
response.
Returns
-------
float
Direct-to-reverberant ratio
"""
early_response, late_field, _ = self.decompose_ir()
num = (early_response**2).sum(axis=-1)
den = (late_field**2).sum(axis=-1)
drr = 10 * paddle.log10(num / den)
return drr
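# `measure_drr` returns 10 * log10(early_energy / late_energy) per item, in dB: an
# early field carrying ten times the late-field energy gives 10 dB, equal energies
# give 0 dB, and a reverberation-dominated response gives a negative DRR. Assuming
# `ir` is an AudioSignal holding impulse responses:
#
#     drr = ir.measure_drr()   # shape [batch, channels]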
@staticmethod
def solve_alpha(early_response, late_field, wd, target_drr):
"""Used to solve for the alpha value, which is used
to alter the drr.
"""
# Equation 5
# ----------
# Apply the good ol' quadratic formula.
wd_sq = wd**2
wd_sq_1 = (1 - wd)**2
e_sq = early_response**2
l_sq = late_field**2
a = (wd_sq * e_sq).sum(axis=-1)
b = (2 * (1 - wd) * wd * e_sq).sum(axis=-1)
c = (wd_sq_1 * e_sq).sum(axis=-1) - paddle.pow(
10 * paddle.ones_like(target_drr), target_drr / 10) * l_sq.sum(
axis=-1)
expr = ((b**2) - 4 * a * c).sqrt()
alpha = paddle.maximum(
(-b - expr) / (2 * a),
(-b + expr) / (2 * a), )
return alpha
def alter_drr(self, drr: typing.Union[paddle.Tensor, np.ndarray, float]):
"""Alters the direct-to-reverberant ratio of the impulse response.
Parameters
----------
drr : typing.Union[paddle.Tensor, np.ndarray, float]
Target direct-to-reverberant ratio that the impulse
response will be altered to.
Returns
-------
AudioSignal
Altered impulse response.
"""
drr = util.ensure_tensor(
drr, 2, self.batch_size
) # Assuming util.ensure_tensor is adapted or equivalent exists
early_response, late_field, window = self.decompose_ir()
alpha = self.solve_alpha(early_response, late_field, window, drr)
min_alpha = late_field.abs().max(axis=-1)[0] / early_response.abs().max(
axis=-1)[0]
alpha = paddle.maximum(alpha, min_alpha)[..., None]
aug_ir_data = alpha * window * early_response + (
(1 - window) * early_response) + late_field
self.audio_data = aug_ir_data
self.ensure_max_of_audio(
) # Assuming ensure_max_of_audio is a method defined elsewhere
return self
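# A usage sketch for `alter_drr`, where `ir_data` is a placeholder `[batch, 1, time]`
# tensor of impulse responses and the target DRR is illustrative:
#
#     ir = AudioSignal(ir_data, 44100)
#     ir.alter_drr(15.0)
#     # ir.measure_drr() should now be close to 15 dB for every item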

@@ -0,0 +1,115 @@
import json
import shlex
import subprocess
import tempfile
from pathlib import Path
from typing import Tuple
import ffmpy
import numpy as np
import paddle
def r128stats(filepath: str, quiet: bool):
"""Takes a path to an audio file, returns a dict with the loudness
stats computed by the ffmpeg ebur128 filter.
Parameters
----------
filepath : str
Path to compute loudness stats on.
quiet : bool
Whether to suppress the FFMPEG banner output during computation.
Returns
-------
dict
Dictionary containing loudness stats.
"""
ffargs = [
"ffmpeg",
"-nostats",
"-i",
filepath,
"-filter_complex",
"ebur128",
"-f",
"null",
"-",
]
if quiet:
ffargs += ["-hide_banner"]
proc = subprocess.Popen(
ffargs, stderr=subprocess.PIPE, universal_newlines=True)
stats = proc.communicate()[1]
summary_index = stats.rfind("Summary:")
summary_list = stats[summary_index:].split()
i_lufs = float(summary_list[summary_list.index("I:") + 1])
i_thresh = float(summary_list[summary_list.index("I:") + 4])
lra = float(summary_list[summary_list.index("LRA:") + 1])
lra_thresh = float(summary_list[summary_list.index("LRA:") + 4])
lra_low = float(summary_list[summary_list.index("low:") + 1])
lra_high = float(summary_list[summary_list.index("high:") + 1])
stats_dict = {
"I": i_lufs,
"I Threshold": i_thresh,
"LRA": lra,
"LRA Threshold": lra_thresh,
"LRA Low": lra_low,
"LRA High": lra_high,
}
return stats_dict
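# A usage sketch for `r128stats` (requires the `ffmpeg` binary on PATH; the file
# path is a placeholder). The keys mirror the ebur128 "Summary:" block parsed above:
#
#     stats = r128stats("speech.wav", quiet=True)
#     # stats["I"] is integrated loudness in LUFS, stats["LRA"] the loudness range in LU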
def ffprobe_offset_and_codec(path: str) -> Tuple[float, str]:
"""Given a path to a file, returns the start time offset and codec of
the first audio stream.
"""
ff = ffmpy.FFprobe(
inputs={path: None},
global_options="-show_entries format=start_time:stream=duration,start_time,codec_type,codec_name,start_pts,time_base -of json -v quiet",
)
streams = json.loads(ff.run(stdout=subprocess.PIPE)[0])["streams"]
seconds_offset = 0.0
codec = None
# Get the offset and codec of the first audio stream we find
# and return its start time, if it has one.
for stream in streams:
if stream["codec_type"] == "audio":
seconds_offset = stream.get("start_time", 0.0)
codec = stream.get("codec_name")
break
return float(seconds_offset), codec
class FFMPEGMixin:
_loudness = None
def ffmpeg_loudness(self, quiet: bool=True):
"""Computes loudness of audio file using FFMPEG.
Parameters
----------
quiet : bool, optional
Whether to suppress the FFMPEG banner output during computation,
by default True
Returns
-------
paddle.Tensor
Loudness of every item in the batch, computed via
FFMPEG.
"""
loudness = []
with tempfile.NamedTemporaryFile(suffix=".wav") as f:
for i in range(self.batch_size):
self[i].write(f.name)
loudness_stats = r128stats(f.name, quiet=quiet)
loudness.append(loudness_stats["I"])
self._loudness = paddle.to_tensor(np.array(loudness)).astype("float32")
return self.loudness()
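# Example (sketch): reading loudness stats for a single file via the ebur128 filter.
# Assumes ffmpeg is on PATH and "speech.wav" exists.
stats = r128stats("speech.wav", quiet=True)
print(stats["I"], "LUFS integrated;", stats["LRA"], "LU loudness range")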

@ -0,0 +1,338 @@
import copy
import numpy as np
import paddle
import paddle.nn.functional as F
import scipy
from . import _julius
class Meter(paddle.nn.Layer):
"""Tensorized version of pyloudnorm.Meter. Works with batched audio tensors.
Parameters
----------
rate : int
Sample rate of audio.
filter_class : str, optional
Class of weighting filter used.
'K-weighting' (default), 'Fenton/Lee 1',
'Fenton/Lee 2', or 'Dash et al.',
by default "K-weighting"
block_size : float, optional
Gating block size in seconds, by default 0.400
zeros : int, optional
Number of zeros to use in FIR approximation of
IIR filters, by default 512
use_fir : bool, optional
Whether to use FIR approximation or exact IIR formulation.
If computing on GPU, ``use_fir=True`` will be used, as it is
much faster, by default False
"""
def __init__(
self,
rate: int,
filter_class: str="K-weighting",
block_size: float=0.400,
zeros: int=512,
use_fir: bool=False, ):
super().__init__()
self.rate = rate
self.filter_class = filter_class
self.block_size = block_size
self.use_fir = use_fir
G = paddle.to_tensor(
np.array([1.0, 1.0, 1.0, 1.41, 1.41]), stop_gradient=True)
self.register_buffer("G", G)
# Compute impulse responses so that filtering is fast via
# a convolution at runtime, on GPU, unlike lfilter.
impulse = np.zeros((zeros, ))
impulse[..., 0] = 1.0
firs = np.zeros((len(self._filters), 1, zeros))
# passband_gain = torch.zeros(len(self._filters))
passband_gain = paddle.zeros([len(self._filters)], dtype="float32")
for i, (_, filter_stage) in enumerate(self._filters.items()):
firs[i] = scipy.signal.lfilter(filter_stage.b, filter_stage.a,
impulse)
passband_gain[i] = filter_stage.passband_gain
firs = paddle.to_tensor(
firs[..., ::-1].copy(), dtype="float32", stop_gradient=True)
self.register_buffer("firs", firs)
self.register_buffer("passband_gain", passband_gain)
def apply_filter_gpu(self, data: paddle.Tensor):
"""Performs FIR approximation of loudness computation.
Parameters
----------
data : paddle.Tensor
Audio data of shape (nb, nch, nt).
Returns
-------
paddle.Tensor
Filtered audio data.
"""
# Data is of shape (nb, nt, nch)
# Reshape to (nb*nch, 1, nt)
nb, nt, nch = data.shape
data = data.transpose([0, 2, 1])
data = data.reshape([nb * nch, 1, nt])
# Apply padding
pad_length = self.firs.shape[-1]
# Apply filtering in sequence
for i in range(self.firs.shape[0]):
data = F.pad(data, (pad_length, pad_length), data_format="NCL")
data = _julius.fft_conv1d(data, self.firs[i, None, ...])
data = self.passband_gain[i] * data
data = data[..., 1:nt + 1]
data = data.transpose([0, 2, 1])
data = data[:, :nt, :]
return data
@staticmethod
def scipy_lfilter(waveform, a_coeffs, b_coeffs, clamp: bool=True):
# Filter with scipy.signal.lfilter (handles 3-D data)
output = np.zeros_like(waveform)
for batch_idx in range(waveform.shape[0]):
for channel_idx in range(waveform.shape[2]):
output[batch_idx, :, channel_idx] = scipy.signal.lfilter(
b_coeffs, a_coeffs, waveform[batch_idx, :, channel_idx])
return output
def apply_filter_cpu(self, data: paddle.Tensor):
"""Performs IIR formulation of loudness computation.
Parameters
----------
data : paddle.Tensor
Audio data of shape (nb, nch, nt).
Returns
-------
paddle.Tensor
Filtered audio data.
"""
_data = data.cpu().numpy().copy()
for _, filter_stage in self._filters.items():
passband_gain = filter_stage.passband_gain
a_coeffs = filter_stage.a
b_coeffs = filter_stage.b
filtered = self.scipy_lfilter(_data, a_coeffs, b_coeffs)
_data[:] = passband_gain * filtered
data = paddle.to_tensor(_data)
return data
def apply_filter(self, data: paddle.Tensor):
"""Applies filter on either CPU or GPU, depending
on if the audio is on GPU or is on CPU, or if
``self.use_fir`` is True.
Parameters
----------
data : paddle.Tensor
Audio data of shape (nb, nch, nt).
Returns
-------
paddle.Tensor
Filtered audio data.
"""
if data.place.is_gpu_place() or self.use_fir:
data = self.apply_filter_gpu(data)
else:
data = self.apply_filter_cpu(data)
return data
def forward(self, data: paddle.Tensor):
"""Computes integrated loudness of data.
Parameters
----------
data : paddle.Tensor
Audio data of shape (nb, nch, nt).
Returns
-------
paddle.Tensor
Integrated loudness of the audio data.
"""
return self.integrated_loudness(data)
def _unfold(self, input_data):
T_g = self.block_size
overlap = 0.75 # overlap of 75% of the block duration
step = 1.0 - overlap # step size by percentage
kernel_size = int(T_g * self.rate)
stride = int(T_g * self.rate * step)
unfolded = _julius.unfold(
input_data.transpose([0, 2, 1]), kernel_size, stride)
unfolded = unfolded.transpose([0, 1, 3, 2])
return unfolded
def integrated_loudness(self, data: paddle.Tensor):
"""Computes integrated loudness of data.
Parameters
----------
data : paddle.Tensor
Audio data of shape (nb, nch, nt).
Returns
-------
paddle.Tensor
Integrated loudness of the audio data.
"""
if not paddle.is_tensor(data):
data = paddle.to_tensor(data, dtype="float32")
else:
data = data.astype("float32")
input_data = data.clone()
# Data always has a batch and channel dimension.
# Is of shape (nb, nt, nch)
if input_data.ndim < 2:
input_data = input_data.unsqueeze(-1)
if input_data.ndim < 3:
input_data = input_data.unsqueeze(0)
nb, nt, nch = input_data.shape
# Apply frequency weighting filters - account
# for the acoustic response of the head and auditory system
input_data = self.apply_filter(input_data)
G = self.G # channel gains
T_g = self.block_size # 400 ms gating block standard
Gamma_a = -70.0 # -70 LKFS = absolute loudness threshold
unfolded = self._unfold(input_data)
z = (1.0 / (T_g * self.rate)) * unfolded.square().sum(2)
l = -0.691 + 10.0 * paddle.log10(
(G[None, :nch, None] * z).sum(1, keepdim=True))
l = l.expand_as(z)
# find gating block indices above absolute threshold
z_avg_gated = z
z_avg_gated[l <= Gamma_a] = 0
masked = l > Gamma_a
z_avg_gated = z_avg_gated.sum(2) / masked.sum(2)
# calculate the relative threshold value (see eq. 6)
Gamma_r = -0.691 + 10.0 * paddle.log10(
(z_avg_gated * G[None, :nch]).sum(-1)) - 10.0
Gamma_r = Gamma_r[:, None, None]
Gamma_r = Gamma_r.expand([nb, nch, l.shape[-1]])
# find gating block indices above relative and absolute thresholds (end of eq. 7)
z_avg_gated = z
z_avg_gated[l <= Gamma_a] = 0
z_avg_gated[l <= Gamma_r] = 0
masked = (l > Gamma_a) * (l > Gamma_r)
z_avg_gated = z_avg_gated.sum(2) / masked.sum(2)
# # Cannot use nan_to_num (pytorch 1.8 does not come with GCP-supported cuda version)
# z_avg_gated = torch.nan_to_num(z_avg_gated)
z_avg_gated = paddle.where(
paddle.isnan(z_avg_gated),
paddle.zeros_like(z_avg_gated), z_avg_gated)
z_avg_gated[z_avg_gated == float("inf")] = float(
np.finfo(np.float32).max)
z_avg_gated[z_avg_gated == -float("inf")] = float(
np.finfo(np.float32).min)
LUFS = -0.691 + 10.0 * paddle.log10(
(G[None, :nch] * z_avg_gated).sum(1))
return LUFS.astype("float32")
@property
def filter_class(self):
return self._filter_class
@filter_class.setter
def filter_class(self, value):
from pyloudnorm import Meter
meter = Meter(self.rate)
meter.filter_class = value
self._filter_class = value
self._filters = meter._filters
class LoudnessMixin:
_loudness = None
MIN_LOUDNESS = -70
"""Minimum loudness possible."""
def loudness(self,
filter_class: str="K-weighting",
block_size: float=0.400,
**kwargs):
"""Calculates loudness using an implementation of ITU-R BS.1770-4.
Allows control over the gating block size and frequency weighting filters.
Measures the integrated gated loudness of a signal.
The API is derived from PyLoudnorm, but this implementation is ported to
PaddlePaddle and is tensorized across batches. When on GPU, an FIR
approximation of the IIR filters is used to compute loudness for speed.
Uses the weighting filters and block size defined by the meter;
the integrated loudness is measured based on the gating algorithm
defined in the ITU-R BS.1770-4 specification.
Parameters
----------
filter_class : str, optional
Class of weighting filter used.
'K-weighting' (default), 'Fenton/Lee 1',
'Fenton/Lee 2', or 'Dash et al.',
by default "K-weighting"
block_size : float, optional
Gating block size in seconds, by default 0.400
kwargs : dict, optional
Keyword arguments to :py:func:`audiotools.core.loudness.Meter`.
Returns
-------
paddle.Tensor
Loudness of audio data.
"""
if self._loudness is not None:
return self._loudness # .to(self.device)
original_length = self.signal_length
if self.signal_duration < 0.5:
pad_len = int((0.5 - self.signal_duration) * self.sample_rate)
self.zero_pad(0, pad_len)
# create BS.1770 meter
meter = Meter(
self.sample_rate,
filter_class=filter_class,
block_size=block_size,
**kwargs)
# meter = meter.to(self.device)
# measure loudness
loudness = meter.integrated_loudness(
self.audio_data.transpose([0, 2, 1]))
self.truncate_samples(original_length)
min_loudness = paddle.ones_like(loudness) * self.MIN_LOUDNESS
self._loudness = paddle.maximum(loudness, min_loudness)
return self._loudness # .to(self.device)
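# Example (sketch): measuring integrated loudness of a batch directly with Meter.
import paddle

audio_data = paddle.randn([1, 1, 44100])  # (nb, nch, nt), one second at 44.1 kHz
meter = Meter(44100, filter_class="K-weighting", block_size=0.400)
# integrated_loudness expects (nb, nt, nch), matching LoudnessMixin.loudness above
lufs = meter.integrated_loudness(audio_data.transpose([0, 2, 1]))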

@ -168,15 +168,17 @@ class ResampleFrac(paddle.nn.Layer):
if self.old_sr == self.new_sr:
return x
shape = x.shape
_dtype = x.dtype
length = x.shape[-1]
x = x.reshape([-1, length])
x = F.pad(
x.unsqueeze(1),
[self._width, self._width + self.old_sr],
mode="replicate",
data_format="NCL", )
data_format="NCL", ).astype(self.kernel.dtype)
ys = F.conv1d(x, self.kernel, stride=self.old_sr, data_format="NCL")
y = ys.transpose([0, 2, 1]).reshape(list(shape[:-1]) + [-1])
y = ys.transpose(
[0, 2, 1]).reshape(list(shape[:-1]) + [-1]).astype(_dtype)
float_output_length = paddle.to_tensor(
self.new_sr * length / self.old_sr, dtype="float32")
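# Example (sketch): the casts above make the resampler dtype-preserving.
# Assumes the functional wrapper resample_frac(x, old_sr, new_sr) mirrors julius.
import paddle

x = paddle.randn([1, 1, 16000], dtype="float64")
y = resample_frac(x, old_sr=16000, new_sr=8000)
assert y.dtype == x.dtype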

@ -1,3 +1,4 @@
import collections
import csv
import glob
import math
@ -8,19 +9,27 @@ import typing
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional, Union, Type, Any, Callable, Tuple, NamedTuple, Iterable
import collections
from typing import NamedTuple
from typing import Optional
from typing import Tuple
from typing import Type
from typing import Union
import librosa
import numpy as np
import paddle
import soundfile
from audio_signal import AudioSignal
from flatten_dict import flatten
from flatten_dict import unflatten
from ..data.preprocess import create_csv
from .audio_signal import AudioSignal
# from ..data.preprocess import create_csv
@dataclass
@ -51,8 +60,7 @@ def info(audio_path: str):
def ensure_tensor(
x: typing.Union[np.ndarray, paddle.Tensor, float, int],
ndim: int=None,
batch_size: int = None,
):
batch_size: int=None, ):
"""✅Ensures that the input ``x`` is a tensor of specified
dimensions and batch size.
@ -86,8 +94,7 @@ def ensure_tensor(
def _get_value(other):
# ✅
# from . import AudioSignal
from audio_signal import AudioSignal
from . import AudioSignal
if isinstance(other, AudioSignal):
return other.audio_data
@ -123,10 +130,11 @@ def random_state(seed: typing.Union[int, np.random.RandomState]):
elif isinstance(seed, np.random.RandomState):
return seed
else:
raise ValueError("%r cannot be used to seed a numpy.random.RandomState" " instance" % seed)
raise ValueError("%r cannot be used to seed a numpy.random.RandomState"
" instance" % seed)
def seed(random_seed):
def seed(random_seed, **kwargs):
"""
Seeds all random states with the same random seed
for reproducibility. Seeds ``numpy``, ``random`` and ``paddle``
@ -209,8 +217,7 @@ def read_sources(
sources: List[str],
remove_empty: bool=True,
relative_path: str="",
ext: List[str] = AUDIO_EXTENSIONS,
):
ext: List[str]=AUDIO_EXTENSIONS, ):
"""✅Reads audio sources that can either be folders
full of audio files, or CSV files that contain paths
to audio files. CSV files that adhere to the expected
@ -253,7 +260,9 @@ def read_sources(
return files
def choose_from_list_of_lists(state: np.random.RandomState, list_of_lists: list, p: float = None):
def choose_from_list_of_lists(state: np.random.RandomState,
list_of_lists: list,
p: float=None):
"""✅Choose a single item from a list of lists.
Parameters
@ -295,7 +304,8 @@ def chdir(newdir: typing.Union[Path, str]):
os.chdir(curdir)
def prepare_batch(batch: typing.Union[dict, list, paddle.Tensor], device: str = "cpu"):
def prepare_batch(batch: typing.Union[dict, list, paddle.Tensor],
device: str="cpu"):
"""✅Moves items in a batch (typically generated by a DataLoader as a list
or a dict) to the specified device. This works even if dictionaries
are nested.
@ -386,8 +396,7 @@ def format_figure(
fig=None,
format_axes: bool=True,
format: bool=True,
font_color: str = "white",
):
font_color: str="white", ):
"""✅Prettifies the spectrogram and waveform plots. A title
can be inset into the top right corner, and the axes can be
inset into the figure, allowing the data to take up the entire
@ -447,8 +456,7 @@ def format_figure(
va="top",
color=font_color,
fontsize=12 * font_scale,
alpha=0.75,
)
alpha=0.75, )
ticks = ax.get_xticks()[2:]
for t in ticks[:-1]:
@ -462,15 +470,15 @@ def format_figure(
va="bottom",
color=font_color,
fontsize=12 * font_scale,
alpha=0.75,
)
alpha=0.75, )
ax.margins(0, 0)
ax.set_axis_off()
ax.xaxis.set_major_locator(plt.NullLocator())
ax.yaxis.set_major_locator(plt.NullLocator())
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
plt.subplots_adjust(
top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
if title is not None:
t = axs[0].annotate(
@ -482,17 +490,61 @@ def format_figure(
textcoords="offset points",
ha="right",
va="top",
color="white",
)
color="white", )
t.set_bbox(dict(facecolor="black", alpha=0.5, edgecolor="black"))
_default_collate_err_msg_format = (
"default_collate: batch must contain tensors, numpy arrays, numbers, " "dicts or lists; found {}"
)
"default_collate: batch must contain tensors, numpy arrays, numbers, "
"dicts or lists; found {}")
def collate_tensor_fn(
batch,
*,
collate_fn_map: Optional[Dict[Union[type, Tuple[type, ...]],
Callable]]=None, ):
out = paddle.stack(batch, axis=0)
return out
def collate_float_fn(
batch,
*,
collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]],
Callable]]=None, ):
return paddle.to_tensor(batch, dtype=paddle.float64)
def collate_int_fn(
batch,
*,
collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]],
Callable]]=None, ):
return paddle.to_tensor(batch)
def default_collate(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]], Callable]] = None):
def collate_str_fn(
batch,
*,
collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]],
Callable]]=None, ):
return batch
default_collate_fn_map: Dict[Union[Type, Tuple[Type, ...]], Callable] = {
paddle.Tensor: collate_tensor_fn
}
default_collate_fn_map[float] = collate_float_fn
default_collate_fn_map[int] = collate_int_fn
default_collate_fn_map[str] = collate_str_fn
default_collate_fn_map[bytes] = collate_str_fn
def default_collate(batch,
*,
collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ...]],
Callable]]=None):
r"""
General collate function that handles the collection type of each element within the batch.
@ -514,38 +566,58 @@ def default_collate(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Ty
if collate_fn_map is not None:
if elem_type in collate_fn_map:
return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
return collate_fn_map[elem_type](
batch, collate_fn_map=collate_fn_map)
for collate_type in collate_fn_map:
if isinstance(elem, collate_type):
return collate_fn_map[collate_type](batch, collate_fn_map=collate_fn_map)
return collate_fn_map[collate_type](
batch, collate_fn_map=collate_fn_map)
if isinstance(elem, collections.abc.Mapping):
try:
return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
return elem_type({
key: default_collate(
[d[key] for d in batch], collate_fn_map=collate_fn_map)
for key in elem
})
except TypeError:
# The mapping type may not support `__init__(iterable)`.
return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
return {
key: default_collate(
[d[key] for d in batch], collate_fn_map=collate_fn_map)
for key in elem
}
elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple
return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
return elem_type(*(default_collate(
samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
elif isinstance(elem, collections.abc.Sequence):
# check to make sure that the elements in batch have consistent size
it = iter(batch)
elem_size = len(next(it))
if not all(len(elem) == elem_size for elem in it):
raise RuntimeError("each element in list of batch should be of equal size")
transposed = list(zip(*batch)) # It may be accessed twice, so we use a list.
raise RuntimeError(
"each element in list of batch should be of equal size")
transposed = list(zip(
*batch)) # It may be accessed twice, so we use a list.
if isinstance(elem, tuple):
return [
collate(samples, collate_fn_map=collate_fn_map) for samples in transposed
default_collate(samples, collate_fn_map=collate_fn_map)
for samples in transposed
] # Backwards compatibility.
else:
try:
return elem_type([collate(samples, collate_fn_map=collate_fn_map) for samples in transposed])
return elem_type([
default_collate(samples, collate_fn_map=collate_fn_map)
for samples in transposed
])
except TypeError:
# The sequence type may not support `__init__(iterable)` (e.g., `range`).
return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
return [
default_collate(samples, collate_fn_map=collate_fn_map)
for samples in transposed
]
raise TypeError(_default_collate_err_msg_format.format(elem_type))
@ -586,7 +658,10 @@ def collate(list_of_dicts: list, n_splits: int = None):
for i in range(0, list_len, n_items):
# Flatten the dictionaries to avoid recursion.
list_of_dicts_ = [flatten(d) for d in list_of_dicts[i:i + n_items]]
dict_of_lists = {k: [dic[k] for dic in list_of_dicts_] for k in list_of_dicts_[0]}
dict_of_lists = {
k: [dic[k] for dic in list_of_dicts_]
for k in list_of_dicts_[0]
}
batch = {}
for k, v in dict_of_lists.items():
@ -594,9 +669,117 @@ def collate(list_of_dicts: list, n_splits: int = None):
if all(isinstance(s, AudioSignal) for s in v):
batch[k] = AudioSignal.batch(v, pad_signals=True)
else:
# Borrow the default collate fn from torch.
batch[k] = default_collate(v)
batch[k] = default_collate(
v, collate_fn_map=default_collate_fn_map)
batches.append(unflatten(batch))
batches = batches[0] if not return_list else batches
return batches
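# Example (sketch): collating a list of per-item dictionaries into one batch dict.
import paddle

items = [{"idx": i, "x": paddle.ones([2])} for i in range(4)]
batch = collate(items)  # {"idx": Tensor of shape [4], "x": Tensor of shape [4, 2]}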
def hz_to_bin(hz: paddle.Tensor, n_fft: int, sample_rate: int):
"""Closest frequency bin given a frequency, number
of bins, and a sampling rate.
Parameters
----------
hz : paddle.Tensor
Tensor of frequencies in Hz.
n_fft : int
Number of FFT bins.
sample_rate : int
Sample rate of audio.
Returns
-------
paddle.Tensor
Closest bins to the data.
"""
shape = hz.shape
hz = hz.reshape([-1])
freqs = paddle.linspace(0, sample_rate / 2, 2 + n_fft // 2)
hz = paddle.clip(hz, max=sample_rate / 2)
closest = (hz[None, :] - freqs[:, None]).abs()
closest_bins = closest.argmin(axis=0)
return closest_bins.reshape(shape)
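# Example (sketch): with sample_rate=44100 and n_fft=2048, bins are spaced
# 44100 / 2048 ≈ 21.5 Hz apart, so 440 Hz lands in bin round(440 / 21.5) ≈ 20.
import paddle

bin_440 = hz_to_bin(paddle.to_tensor([440.0]), n_fft=2048, sample_rate=44100)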
def generate_chord_dataset(
max_voices: int=8,
sample_rate: int=44100,
num_items: int=5,
duration: float=1.0,
min_note: str="C2",
max_note: str="C6",
output_dir: Path="chords", ):
"""
Generates a toy multitrack dataset of chords, synthesized from sine waves.
Parameters
----------
max_voices : int, optional
Maximum number of voices in a chord, by default 8
sample_rate : int, optional
Sample rate of audio, by default 44100
num_items : int, optional
Number of items to generate, by default 5
duration : float, optional
Duration of each item, by default 1.0
min_note : str, optional
Minimum note in the dataset, by default "C2"
max_note : str, optional
Maximum note in the dataset, by default "C6"
output_dir : Path, optional
Directory to save the dataset, by default "chords"
"""
import librosa
from . import AudioSignal
from ..data.preprocess import create_csv
min_midi = librosa.note_to_midi(min_note)
max_midi = librosa.note_to_midi(max_note)
tracks = []
for idx in range(num_items):
track = {}
# figure out how many voices to put in this track
num_voices = random.randint(1, max_voices)
for voice_idx in range(num_voices):
# choose some random params
midinote = random.randint(min_midi, max_midi)
dur = random.uniform(0.85 * duration, duration)
sig = AudioSignal.wave(
frequency=librosa.midi_to_hz(midinote),
duration=dur,
sample_rate=sample_rate,
shape="sine", )
track[f"voice_{voice_idx}"] = sig
tracks.append(track)
# save the tracks to disk
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)
for idx, track in enumerate(tracks):
track_dir = output_dir / f"track_{idx}"
track_dir.mkdir(exist_ok=True)
for voice_name, sig in track.items():
sig.write(track_dir / f"{voice_name}.wav")
all_voices = list(set([k for track in tracks for k in track.keys()]))
voice_lists = {voice: [] for voice in all_voices}
for track in tracks:
for voice_name in all_voices:
if voice_name in track:
voice_lists[voice_name].append(track[voice_name].path_to_file)
else:
voice_lists[voice_name].append("")
for voice_name, paths in voice_lists.items():
create_csv(paths, output_dir / f"{voice_name}.csv", loudness=True)
return output_dir

@ -5,10 +5,12 @@ from typing import List
from typing import Union
import numpy as np
from audio_signal import AudioSignal
import util
import paddle
from paddle.io import SequenceSampler, DistributedBatchSampler
from paddle.io import DistributedBatchSampler
from paddle.io import SequenceSampler
from ..core import AudioSignal
from ..core import util
class AudioLoader:
@ -48,13 +50,13 @@ class AudioLoader:
relative_path: str="",
ext: List[str]=util.AUDIO_EXTENSIONS,
shuffle: bool=True,
shuffle_state: int = 0,
):
self.audio_lists = util.read_sources(sources, relative_path=relative_path, ext=ext)
shuffle_state: int=0, ):
self.audio_lists = util.read_sources(
sources, relative_path=relative_path, ext=ext)
self.audio_indices = [
(src_idx, item_idx) for src_idx, src in enumerate(self.audio_lists) for item_idx in range(len(src))
]
self.audio_indices = [(src_idx, item_idx)
for src_idx, src in enumerate(self.audio_lists)
for item_idx in range(len(src))]
if shuffle:
state = util.random_state(shuffle_state)
state.shuffle(self.audio_indices)
@ -73,18 +75,19 @@ class AudioLoader:
offset: float=None,
source_idx: int=None,
item_idx: int=None,
global_idx: int = None,
):
global_idx: int=None, ):
if source_idx is not None and item_idx is not None:
try:
audio_info = self.audio_lists[source_idx][item_idx]
except:
audio_info = {"path": "none"}
elif global_idx is not None:
source_idx, item_idx = self.audio_indices[global_idx % len(self.audio_indices)]
source_idx, item_idx = self.audio_indices[global_idx %
len(self.audio_indices)]
audio_info = self.audio_lists[source_idx][item_idx]
else:
audio_info, source_idx, item_idx = util.choose_from_list_of_lists(state, self.audio_lists, p=self.weights)
audio_info, source_idx, item_idx = util.choose_from_list_of_lists(
state, self.audio_lists, p=self.weights)
path = audio_info["path"]
signal = AudioSignal.zeros(duration, sample_rate, num_channels)
@ -95,14 +98,12 @@ class AudioLoader:
path,
duration=duration,
state=state,
loudness_cutoff=loudness_cutoff,
)
loudness_cutoff=loudness_cutoff, )
else:
signal = AudioSignal(
path,
offset=offset,
duration=duration,
)
duration=duration, )
if num_channels == 1:
signal = signal.to_mono()
@ -122,7 +123,8 @@ class AudioLoader:
"path": str(path),
}
if self.transform is not None:
item["transform_args"] = self.transform.instantiate(state, signal=signal)
item["transform_args"] = self.transform.instantiate(
state, signal=signal)
return item
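# Example (sketch): drawing a random one-second excerpt with AudioLoader.
# Assumes "audio/" contains audio files and that the unshown positional
# parameters of __call__ are (state, sample_rate, duration, ...) as in the
# original audiotools loader.
loader = AudioLoader(sources=["audio/"])
state = util.random_state(0)
item = loader(state, 44100, duration=1.0)
signal, path = item["signal"], item["path"]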
@ -348,7 +350,8 @@ class AudioDataset:
def __init__(
self,
loaders: Union[AudioLoader, List[AudioLoader], Dict[str, AudioLoader]],
loaders: Union[AudioLoader, List[AudioLoader], Dict[str,
AudioLoader]],
sample_rate: int,
n_examples: int=1000,
duration: float=0.5,
@ -359,8 +362,7 @@ class AudioDataset:
aligned: bool=False,
shuffle_loaders: bool=False,
matcher: Callable=default_matcher,
without_replacement: bool = True,
):
without_replacement: bool=True, ):
# Internally we convert loaders to a dictionary
if isinstance(loaders, list):
loaders = {i: l for i, l in enumerate(loaders)}
@ -415,13 +417,11 @@ class AudioDataset:
# Path mapper takes the current loader + everything
# returned by the first loader.
offset = item[keys[0]]["signal"].metadata["offset"]
loader_kwargs.update(
{
loader_kwargs.update({
"offset": offset,
"source_idx": item[keys[0]]["source_idx"],
"item_idx": item[keys[0]]["item_idx"],
}
)
})
item[key] = loader(**loader_kwargs)
# Sort dictionary back into original order
@ -430,7 +430,8 @@ class AudioDataset:
item["idx"] = idx
if self.transform is not None:
item["transform_args"] = self.transform.instantiate(state=state, signal=item[keys[0]]["signal"])
item["transform_args"] = self.transform.instantiate(
state=state, signal=item[keys[0]]["signal"])
# If there's only one loader, pop it up
# to the main dictionary, instead of keeping it
@ -495,24 +496,29 @@ class ConcatDataset(AudioDataset):
class ResumableDistributedSampler(DistributedBatchSampler): # pragma: no cover
"""Distributed sampler that can be resumed from a given start index."""
def __init__(
self, dataset, batch_size, start_idx: int = None, num_replicas=None, rank=None, shuffle=False, drop_last=False
):
def __init__(self,
dataset,
batch_size,
start_idx: int=None,
num_replicas=None,
rank=None,
shuffle=False,
drop_last=False):
super().__init__(
dataset=dataset,
batch_size=batch_size,
num_replicas=num_replicas,
rank=rank,
shuffle=shuffle,
drop_last=drop_last,
)
drop_last=drop_last, )
# Start index, allows to resume an experiment at the index it was
if start_idx is not None:
self.start_idx = start_idx // self.num_replicas
else:
self.start_idx = 0
# Recompute the total sample count, since DistributedBatchSampler's __len__ is based on the shuffled sample count
self.total_size = len(self.dataset) if not shuffle else len(self.indices)
self.total_size = len(self.dataset) if not shuffle else len(
self.indices)
def __iter__(self):
# Paddle's DistributedBatchSampler yields whole batches, so we need to unpack them into individual indices

@ -5,18 +5,15 @@ from typing import List
import numpy as np
import paddle
# import torch
from flatten_dict import flatten
from flatten_dict import unflatten
from numpy.random import RandomState
# from .. import ml
from .. import ml
from ..core import AudioSignal
from ..core import util
from .datasets import AudioLoader
tt = paddle.to_tensor
class BaseTransform:
"""✅This is the base class for all transforms that are implemented
@ -112,14 +109,14 @@ class BaseTransform:
return {}
@staticmethod
def apply_mask(batch: dict, mask: torch.Tensor):
def apply_mask(batch: dict, mask: paddle.Tensor):
"""Applies a mask to the batch.
Parameters
----------
batch : dict
Batch whose values will be masked in the ``transform`` pass.
mask : torch.Tensor
mask : paddle.Tensor
Mask to apply to batch.
Returns
@ -127,7 +124,16 @@ class BaseTransform:
dict
A dictionary that contains values only where ``mask = True``.
"""
masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
# masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
masked_batch = {}
for k, v in flatten(batch).items():
if 0 == mask.dim() and 0 == v.dim():
if mask: # 0-d tensor that is True
masked_batch[k] = v[None]
else:
masked_batch[k] = paddle.to_tensor([], dtype=v.dtype)
else:
masked_batch[k] = v[mask]
return unflatten(masked_batch)
def transform(self, signal: AudioSignal, **kwargs):
@ -158,7 +164,7 @@ class BaseTransform:
tfm_kwargs = self._prepare(kwargs)
mask = tfm_kwargs["mask"]
if torch.any(mask):
if paddle.any(mask):
tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
signal[mask] = self._transform(signal[mask], **tfm_kwargs)
@ -171,8 +177,7 @@ class BaseTransform:
def instantiate(
self,
state: RandomState=None,
signal: AudioSignal = None,
):
signal: AudioSignal=None, ):
"""Instantiates parameters for the transform.
Parameters
@ -202,7 +207,8 @@ class BaseTransform:
# is needed before passing it in, so that the end-user
# doesn't need to have variables they're not using flowing
# into their function.
needs_signal = "signal" in set(signature(self._instantiate).parameters.keys())
needs_signal = "signal" in set(
signature(self._instantiate).parameters.keys())
kwargs = {}
if needs_signal:
kwargs = {"signal": signal}
@ -211,12 +217,12 @@ class BaseTransform:
params = self._instantiate(state, **kwargs)
for k in list(params.keys()):
v = params[k]
if isinstance(v, (AudioSignal, torch.Tensor, dict)):
if isinstance(v, (AudioSignal, paddle.Tensor, dict)):
params[k] = v
else:
params[k] = tt(v)
params[k] = paddle.to_tensor(v)
mask = state.rand() <= self.prob
params[f"mask"] = tt(mask)
params[f"mask"] = paddle.to_tensor(mask)
# Put the params into a nested dictionary that will be
# used later when calling the transform. This is to avoid
@ -228,8 +234,7 @@ class BaseTransform:
def batch_instantiate(
self,
states: list=None,
signal: AudioSignal = None,
):
signal: AudioSignal=None, ):
"""Instantiates arguments for every item in a batch,
given a list of states. Each state in the list
corresponds to one item in the batch.
@ -452,8 +457,7 @@ class Choose(Compose):
*transforms: list,
weights: list=None,
name: str=None,
prob: float = 1.0,
):
prob: float=1.0, ):
super().__init__(*transforms, name=name, prob=prob)
if weights is None:
@ -491,8 +495,7 @@ class Repeat(Compose):
transform,
n_repeat: int=1,
name: str=None,
prob: float = 1.0,
):
prob: float=1.0, ):
transforms = [copy.copy(transform) for _ in range(n_repeat)]
super().__init__(transforms, name=name, prob=prob)
@ -518,8 +521,7 @@ class RepeatUpTo(Choose):
max_repeat: int=5,
weights: list=None,
name: str=None,
prob: float = 1.0,
):
prob: float=1.0, ):
transforms = []
for n in range(1, max_repeat):
transforms.append(Repeat(transform, n_repeat=n))
@ -548,8 +550,7 @@ class ClippingDistortion(BaseTransform):
self,
perc: tuple=("uniform", 0.0, 0.1),
name: str=None,
prob: float = 1.0,
):
prob: float=1.0, ):
super().__init__(name=name, prob=prob)
self.perc = perc
@ -561,43 +562,42 @@ class ClippingDistortion(BaseTransform):
return signal.clip_distortion(perc)
# class Equalizer(BaseTransform):
# """❌Applies an equalization curve to the audio signal. Corresponds
# to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.
class Equalizer(BaseTransform):
"""Applies an equalization curve to the audio signal. Corresponds
to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.
# Parameters
# ----------
# eq_amount : tuple, optional
# The maximum dB cut to apply to the audio in any band,
# by default ("const", 1.0 dB)
# n_bands : int, optional
# Number of bands in EQ, by default 6
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 1.0
# """
Parameters
----------
eq_amount : tuple, optional
The maximum dB cut to apply to the audio in any band,
by default ("const", 1.0 dB)
n_bands : int, optional
Number of bands in EQ, by default 6
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 1.0
"""
# def __init__(
# self,
# eq_amount: tuple = ("const", 1.0),
# n_bands: int = 6,
# name: str = None,
# prob: float = 1.0,
# ):
# super().__init__(name=name, prob=prob)
def __init__(
self,
eq_amount: tuple=("const", 1.0),
n_bands: int=6,
name: str=None,
prob: float=1.0, ):
super().__init__(name=name, prob=prob)
# self.eq_amount = eq_amount
# self.n_bands = n_bands
self.eq_amount = eq_amount
self.n_bands = n_bands
# def _instantiate(self, state: RandomState):
# eq_amount = util.sample_from_dist(self.eq_amount, state)
# eq = -eq_amount * state.rand(self.n_bands)
# return {"eq": eq}
def _instantiate(self, state: RandomState):
eq_amount = util.sample_from_dist(self.eq_amount, state)
eq = -eq_amount * state.rand(self.n_bands)
return {"eq": eq}
# def _transform(self, signal, eq):
# return signal.equalizer(eq)
def _transform(self, signal, eq):
return signal.equalizer(eq)
# class Quantization(BaseTransform):
@ -632,7 +632,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, channels):
# return signal.quantization(channels)
# class MuLawQuantization(BaseTransform):
# """Applies mu-law quantization to the input waveform. Corresponds
# to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.
@ -665,7 +664,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, channels):
# return signal.mulaw_quantization(channels)
# class NoiseFloor(BaseTransform):
# """Adds a noise floor of Gaussian noise to the signal at a specified
# dB.
@ -704,92 +702,90 @@ class ClippingDistortion(BaseTransform):
# return signal + nz_signal
# class BackgroundNoise(BaseTransform):
# """Adds background noise from audio specified by a set of CSV files.
# A valid CSV file looks like, and is typically generated by
# :py:func:`audiotools.data.preprocess.create_csv`:
class BackgroundNoise(BaseTransform):
"""Adds background noise from audio specified by a set of CSV files.
A valid CSV file looks like, and is typically generated by
:py:func:`audiotools.data.preprocess.create_csv`:
# .. csv-table::
# :header: path
# room_tone/m6_script2_clean.wav
# room_tone/m6_script2_cleanraw.wav
# room_tone/m6_script2_ipad_balcony1.wav
# room_tone/m6_script2_ipad_bedroom1.wav
# room_tone/m6_script2_ipad_confroom1.wav
# room_tone/m6_script2_ipad_confroom2.wav
# room_tone/m6_script2_ipad_livingroom1.wav
# room_tone/m6_script2_ipad_office1.wav
# .. note::
# All paths are relative to an environment variable called ``PATH_TO_DATA``,
# so that CSV files are portable across machines where data may be
# located in different places.
.. csv-table::
:header: path
# This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
# and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
# hood.
room_tone/m6_script2_clean.wav
room_tone/m6_script2_cleanraw.wav
room_tone/m6_script2_ipad_balcony1.wav
room_tone/m6_script2_ipad_bedroom1.wav
room_tone/m6_script2_ipad_confroom1.wav
room_tone/m6_script2_ipad_confroom2.wav
room_tone/m6_script2_ipad_livingroom1.wav
room_tone/m6_script2_ipad_office1.wav
# Parameters
# ----------
# snr : tuple, optional
# Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
# sources : List[str], optional
# Sources containing folders, or CSVs with paths to audio files,
# by default None
# weights : List[float], optional
# Weights to sample audio files from each source, by default None
# eq_amount : tuple, optional
# Amount of equalization to apply, by default ("const", 1.0)
# n_bands : int, optional
# Number of bands in equalizer, by default 3
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 1.0
# loudness_cutoff : float, optional
# Loudness cutoff when loading from audio files, by default None
# """
.. note::
All paths are relative to an environment variable called ``PATH_TO_DATA``,
so that CSV files are portable across machines where data may be
located in different places.
# def __init__(
# self,
# snr: tuple = ("uniform", 10.0, 30.0),
# sources: List[str] = None,
# weights: List[float] = None,
# eq_amount: tuple = ("const", 1.0),
# n_bands: int = 3,
# name: str = None,
# prob: float = 1.0,
# loudness_cutoff: float = None,
# ):
# super().__init__(name=name, prob=prob)
This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
hood.
# self.snr = snr
# self.eq_amount = eq_amount
# self.n_bands = n_bands
# self.loader = AudioLoader(sources, weights)
# self.loudness_cutoff = loudness_cutoff
Parameters
----------
snr : tuple, optional
Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
sources : List[str], optional
Sources containing folders, or CSVs with paths to audio files,
by default None
weights : List[float], optional
Weights to sample audio files from each source, by default None
eq_amount : tuple, optional
Amount of equalization to apply, by default ("const", 1.0)
n_bands : int, optional
Number of bands in equalizer, by default 3
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 1.0
loudness_cutoff : float, optional
Loudness cutoff when loading from audio files, by default None
"""
# def _instantiate(self, state: RandomState, signal: AudioSignal):
# eq_amount = util.sample_from_dist(self.eq_amount, state)
# eq = -eq_amount * state.rand(self.n_bands)
# snr = util.sample_from_dist(self.snr, state)
def __init__(
self,
snr: tuple=("uniform", 10.0, 30.0),
sources: List[str]=None,
weights: List[float]=None,
eq_amount: tuple=("const", 1.0),
n_bands: int=3,
name: str=None,
prob: float=1.0,
loudness_cutoff: float=None, ):
super().__init__(name=name, prob=prob)
# bg_signal = self.loader(
# state,
# signal.sample_rate,
# duration=signal.signal_duration,
# loudness_cutoff=self.loudness_cutoff,
# num_channels=signal.num_channels,
# )["signal"]
self.snr = snr
self.eq_amount = eq_amount
self.n_bands = n_bands
self.loader = AudioLoader(sources, weights)
self.loudness_cutoff = loudness_cutoff
# return {"eq": eq, "bg_signal": bg_signal, "snr": snr}
def _instantiate(self, state: RandomState, signal: AudioSignal):
eq_amount = util.sample_from_dist(self.eq_amount, state)
eq = -eq_amount * state.rand(self.n_bands)
snr = util.sample_from_dist(self.snr, state)
# def _transform(self, signal, bg_signal, snr, eq):
# # Clone bg_signal so that transform can be repeatedly applied
# # to different signals with the same effect.
# return signal.mix(bg_signal.clone(), snr, eq)
bg_signal = self.loader(
state,
signal.sample_rate,
duration=signal.signal_duration,
loudness_cutoff=self.loudness_cutoff,
num_channels=signal.num_channels, )["signal"]
return {"eq": eq, "bg_signal": bg_signal, "snr": snr}
def _transform(self, signal, bg_signal, snr, eq):
# Clone bg_signal so that transform can be repeatedly applied
# to different signals with the same effect.
return signal.mix(bg_signal.clone(), snr, eq)
# class CrossTalk(BaseTransform):
@ -854,88 +850,88 @@ class ClippingDistortion(BaseTransform):
# return mix
# class RoomImpulseResponse(BaseTransform):
# """Convolves signal with a room impulse response, at a specified
# direct-to-reverberant ratio, with equalization applied. Room impulse
# response data is drawn from a CSV file that was produced via
# :py:func:`audiotools.data.preprocess.create_csv`.
# This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
# under the hood.
# Parameters
# ----------
# drr : tuple, optional
# _description_, by default ("uniform", 0.0, 30.0)
# sources : List[str], optional
# Sources containing folders, or CSVs with paths to audio files,
# by default None
# weights : List[float], optional
# Weights to sample audio files from each source, by default None
# eq_amount : tuple, optional
# Amount of equalization to apply, by default ("const", 1.0)
# n_bands : int, optional
# Number of bands in equalizer, by default 6
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 1.0
# use_original_phase : bool, optional
# Whether or not to use the original phase, by default False
# offset : float, optional
# Offset from each impulse response file to use, by default 0.0
# duration : float, optional
# Duration of each impulse response, by default 1.0
# """
# def __init__(
# self,
# drr: tuple = ("uniform", 0.0, 30.0),
# sources: List[str] = None,
# weights: List[float] = None,
# eq_amount: tuple = ("const", 1.0),
# n_bands: int = 6,
# name: str = None,
# prob: float = 1.0,
# use_original_phase: bool = False,
# offset: float = 0.0,
# duration: float = 1.0,
# ):
# super().__init__(name=name, prob=prob)
class RoomImpulseResponse(BaseTransform):
"""Convolves signal with a room impulse response, at a specified
direct-to-reverberant ratio, with equalization applied. Room impulse
response data is drawn from a CSV file that was produced via
:py:func:`audiotools.data.preprocess.create_csv`.
# self.drr = drr
# self.eq_amount = eq_amount
# self.n_bands = n_bands
# self.use_original_phase = use_original_phase
This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
under the hood.
# self.loader = AudioLoader(sources, weights)
# self.offset = offset
# self.duration = duration
Parameters
----------
drr : tuple, optional
Direct-to-reverberant ratio distribution, by default ("uniform", 0.0, 30.0)
sources : List[str], optional
Sources containing folders, or CSVs with paths to audio files,
by default None
weights : List[float], optional
Weights to sample audio files from each source, by default None
eq_amount : tuple, optional
Amount of equalization to apply, by default ("const", 1.0)
n_bands : int, optional
Number of bands in equalizer, by default 6
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 1.0
use_original_phase : bool, optional
Whether or not to use the original phase, by default False
offset : float, optional
Offset from each impulse response file to use, by default 0.0
duration : float, optional
Duration of each impulse response, by default 1.0
"""
# def _instantiate(self, state: RandomState, signal: AudioSignal = None):
# eq_amount = util.sample_from_dist(self.eq_amount, state)
# eq = -eq_amount * state.rand(self.n_bands)
# drr = util.sample_from_dist(self.drr, state)
def __init__(
self,
drr: tuple=("uniform", 0.0, 30.0),
sources: List[str]=None,
weights: List[float]=None,
eq_amount: tuple=("const", 1.0),
n_bands: int=6,
name: str=None,
prob: float=1.0,
use_original_phase: bool=False,
offset: float=0.0,
duration: float=1.0, ):
super().__init__(name=name, prob=prob)
# ir_signal = self.loader(
# state,
# signal.sample_rate,
# offset=self.offset,
# duration=self.duration,
# loudness_cutoff=None,
# num_channels=signal.num_channels,
# )["signal"]
# ir_signal.zero_pad_to(signal.sample_rate)
self.drr = drr
self.eq_amount = eq_amount
self.n_bands = n_bands
self.use_original_phase = use_original_phase
# return {"eq": eq, "ir_signal": ir_signal, "drr": drr}
self.loader = AudioLoader(sources, weights)
self.offset = offset
self.duration = duration
# def _transform(self, signal, ir_signal, drr, eq):
# # Clone ir_signal so that transform can be repeatedly applied
# # to different signals with the same effect.
# return signal.apply_ir(
# ir_signal.clone(), drr, eq, use_original_phase=self.use_original_phase
# )
def _instantiate(self, state: RandomState, signal: AudioSignal=None):
eq_amount = util.sample_from_dist(self.eq_amount, state)
eq = -eq_amount * state.rand(self.n_bands)
drr = util.sample_from_dist(self.drr, state)
ir_signal = self.loader(
state,
signal.sample_rate,
offset=self.offset,
duration=self.duration,
loudness_cutoff=None,
num_channels=signal.num_channels, )["signal"]
ir_signal.zero_pad_to(signal.sample_rate)
return {"eq": eq, "ir_signal": ir_signal, "drr": drr}
def _transform(self, signal, ir_signal, drr, eq):
# Clone ir_signal so that transform can be repeatedly applied
# to different signals with the same effect.
return signal.apply_ir(
ir_signal.clone(),
drr,
eq,
use_original_phase=self.use_original_phase)
# class VolumeChange(BaseTransform):
@ -970,37 +966,36 @@ class ClippingDistortion(BaseTransform):
# return signal.volume_change(db)
# class VolumeNorm(BaseTransform):
# """Normalizes the volume of the excerpt to a specified decibel.
class VolumeNorm(BaseTransform):
"""Normalizes the volume of the excerpt to a specified decibel.
# Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.
Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.
# Parameters
# ----------
# db : tuple, optional
# dB to normalize signal to, by default ("const", -24)
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 1.0
# """
Parameters
----------
db : tuple, optional
dB to normalize signal to, by default ("const", -24)
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 1.0
"""
# def __init__(
# self,
# db: tuple = ("const", -24),
# name: str = None,
# prob: float = 1.0,
# ):
# super().__init__(name=name, prob=prob)
def __init__(
self,
db: tuple=("const", -24),
name: str=None,
prob: float=1.0, ):
super().__init__(name=name, prob=prob)
# self.db = db
self.db = db
# def _instantiate(self, state: RandomState):
# return {"db": util.sample_from_dist(self.db, state)}
def _instantiate(self, state: RandomState):
return {"db": util.sample_from_dist(self.db, state)}
# def _transform(self, signal, db):
# return signal.normalize(db)
def _transform(self, signal, db):
return signal.normalize(db)
# class GlobalVolumeNorm(BaseTransform):
@ -1063,111 +1058,108 @@ class ClippingDistortion(BaseTransform):
# return signal.volume_change(db)
# class Silence(BaseTransform):
# """Zeros out the signal with some probability.
class Silence(BaseTransform):
"""Zeros out the signal with some probability.
# Parameters
# ----------
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 0.1
# """
Parameters
----------
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 0.1
"""
# def __init__(self, name: str = None, prob: float = 0.1):
# super().__init__(name=name, prob=prob)
def __init__(self, name: str=None, prob: float=0.1):
super().__init__(name=name, prob=prob)
# def _transform(self, signal):
# _loudness = signal._loudness
# signal = AudioSignal(
# torch.zeros_like(signal.audio_data),
# sample_rate=signal.sample_rate,
# stft_params=signal.stft_params,
# )
# # So that the amount of noise added is as if it wasn't silenced.
# # TODO: improve this hack
# signal._loudness = _loudness
def _transform(self, signal):
_loudness = signal._loudness
signal = AudioSignal(
paddle.zeros_like(signal.audio_data),
sample_rate=signal.sample_rate,
stft_params=signal.stft_params, )
# So that the amount of noise added is as if it wasn't silenced.
# TODO: improve this hack
signal._loudness = _loudness
# return signal
return signal
# class LowPass(BaseTransform):
# """Applies a LowPass filter.
class LowPass(BaseTransform):
"""Applies a LowPass filter.
# Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.
Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.
# Parameters
# ----------
# cutoff : tuple, optional
# Cutoff frequency distribution,
# by default ``("choice", [4000, 8000, 16000])``
# zeros : int, optional
# Number of zero-crossings in filter, argument to
# ``julius.LowPassFilters``, by default 51
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 1.0
# """
Parameters
----------
cutoff : tuple, optional
Cutoff frequency distribution,
by default ``("choice", [4000, 8000, 16000])``
zeros : int, optional
Number of zero-crossings in filter, argument to
``julius.LowPassFilters``, by default 51
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 1.0
"""
# def __init__(
# self,
# cutoff: tuple = ("choice", [4000, 8000, 16000]),
# zeros: int = 51,
# name: str = None,
# prob: float = 1,
# ):
# super().__init__(name=name, prob=prob)
def __init__(
self,
cutoff: tuple=("choice", [4000, 8000, 16000]),
zeros: int=51,
name: str=None,
prob: float=1, ):
super().__init__(name=name, prob=prob)
# self.cutoff = cutoff
# self.zeros = zeros
self.cutoff = cutoff
self.zeros = zeros
# def _instantiate(self, state: RandomState):
# return {"cutoff": util.sample_from_dist(self.cutoff, state)}
def _instantiate(self, state: RandomState):
return {"cutoff": util.sample_from_dist(self.cutoff, state)}
# def _transform(self, signal, cutoff):
# return signal.low_pass(cutoff, zeros=self.zeros)
def _transform(self, signal, cutoff):
return signal.low_pass(cutoff, zeros=self.zeros)
# class HighPass(BaseTransform):
# """Applies a HighPass filter.
class HighPass(BaseTransform):
"""Applies a HighPass filter.
# Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.
Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.
# Parameters
# ----------
# cutoff : tuple, optional
# Cutoff frequency distribution,
# by default ``("choice", [50, 100, 250, 500, 1000])``
# zeros : int, optional
# Number of zero-crossings in filter, argument to
# ``julius.LowPassFilters``, by default 51
# name : str, optional
# Name of this transform, used to identify it in the dictionary
# produced by ``self.instantiate``, by default None
# prob : float, optional
# Probability of applying this transform, by default 1.0
# """
Parameters
----------
cutoff : tuple, optional
Cutoff frequency distribution,
by default ``("choice", [50, 100, 250, 500, 1000])``
zeros : int, optional
Number of zero-crossings in filter, argument to
``julius.LowPassFilters``, by default 51
name : str, optional
Name of this transform, used to identify it in the dictionary
produced by ``self.instantiate``, by default None
prob : float, optional
Probability of applying this transform, by default 1.0
"""
# def __init__(
# self,
# cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
# zeros: int = 51,
# name: str = None,
# prob: float = 1,
# ):
# super().__init__(name=name, prob=prob)
def __init__(
self,
cutoff: tuple=("choice", [50, 100, 250, 500, 1000]),
zeros: int=51,
name: str=None,
prob: float=1, ):
super().__init__(name=name, prob=prob)
# self.cutoff = cutoff
# self.zeros = zeros
self.cutoff = cutoff
self.zeros = zeros
# def _instantiate(self, state: RandomState):
# return {"cutoff": util.sample_from_dist(self.cutoff, state)}
def _instantiate(self, state: RandomState):
return {"cutoff": util.sample_from_dist(self.cutoff, state)}
# def _transform(self, signal, cutoff):
# return signal.high_pass(cutoff, zeros=self.zeros)
def _transform(self, signal, cutoff):
return signal.high_pass(cutoff, zeros=self.zeros)
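# Example (sketch): the instantiate -> transform pattern shared by these classes.
# Assumes "speech.wav" exists and that Compose (the container that Choose and
# Repeat above extend) is available in this module.
transform = Compose(VolumeNorm(("const", -24)), LowPass(("choice", [4000, 8000])))
state = util.random_state(0)
signal = AudioSignal("speech.wav")
kwargs = transform.instantiate(state, signal=signal)
output = transform.transform(signal.clone(), **kwargs)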
# class RescaleAudio(BaseTransform):
@ -1196,7 +1188,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal):
# return signal.ensure_max_of_audio(self.val)
# class ShiftPhase(SpectralTransform):
# """Shifts the phase of the audio.
@ -1228,7 +1219,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, shift):
# return signal.shift_phase(shift)
# class InvertPhase(ShiftPhase):
# """Inverts the phase of the audio.
@ -1246,7 +1236,6 @@ class ClippingDistortion(BaseTransform):
# def __init__(self, name: str = None, prob: float = 1):
# super().__init__(shift=("const", np.pi), name=name, prob=prob)
# class CorruptPhase(SpectralTransform):
# """Corrupts the phase of the audio.
@ -1277,7 +1266,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, corruption):
# return signal.shift_phase(shift=corruption)
# class FrequencyMask(SpectralTransform):
# """Masks a band of frequencies at a center frequency
# from the audio.
@ -1323,7 +1311,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, fmin_hz: float, fmax_hz: float):
# return signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
# class TimeMask(SpectralTransform):
# """Masks out contiguous time-steps from signal.
@ -1368,7 +1355,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, tmin_s: float, tmax_s: float):
# return signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)
# class MaskLowMagnitudes(SpectralTransform):
# """Masks low magnitude regions out of signal.
@ -1401,7 +1387,6 @@ class ClippingDistortion(BaseTransform):
# def _transform(self, signal, db_cutoff: float):
# return signal.mask_low_magnitudes(db_cutoff)
# class Smoothing(BaseTransform):
# """Convolves the signal with a smoothing window.
@ -1452,7 +1437,6 @@ class ClippingDistortion(BaseTransform):
# out = out * (sscale / oscale)
# return out
# class TimeNoise(TimeMask):
# """Similar to :py:func:`audiotools.data.transforms.TimeMask`, but
# replaces with noise instead of zeros.
@ -1494,7 +1478,6 @@ class ClippingDistortion(BaseTransform):
# signal.phase = phase
# return signal
# class FrequencyNoise(FrequencyMask):
# """Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but
# replaces with noise instead of zeros.
@ -1535,7 +1518,6 @@ class ClippingDistortion(BaseTransform):
# signal.phase = phase
# return signal
# class SpectralDenoising(Equalizer):
# """Applies denoising algorithm detailed in
# :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,

@ -2,7 +2,8 @@ import os
import numpy as np
import paddle
from audio_signal import AudioSignal
from ..core import AudioSignal
def visqol(

@ -3,7 +3,9 @@ import typing
import paddle
import paddle.distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler, SequentialSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.io import SequenceSampler
class ResumableDistributedSampler(DistributedBatchSampler): # pragma: no cover
@ -21,7 +23,7 @@ class ResumableDistributedSampler(DistributedBatchSampler): # pragma: no cover
self.start_idx = 0 # set the index back to 0 so for the next epoch
class ResumableSequentialSampler(SequentialSampler): # pragma: no cover
class ResumableSequentialSampler(SequenceSampler): # pragma: no cover
"""Sequential sampler that can be resumed from a given start index."""
def __init__(self, dataset, start_idx: int=None, **kwargs):
@ -145,7 +147,10 @@ class Accelerator: # pragma: no cover
# https://www.paddlepaddle.org.cn/documentation/docs/zh/2.6/api/paddle/amp/GradScaler_cn.html#step-optimizer
self.scaler.update()
def prepare_dataloader(self, dataset: typing.Iterable, start_idx: int = None, **kwargs):
def prepare_dataloader(self,
dataset: typing.Iterable,
start_idx: int=None,
**kwargs):
"""Wraps a dataset with a DataLoader, using the correct sampler if DDP is
enabled.
@ -171,10 +176,10 @@ class Accelerator: # pragma: no cover
shuffle=kwargs.get("shuffle", True),
drop_last=kwargs.get("drop_last", False),
num_replicas=self.world_size,
rank=self.local_rank,
)
rank=self.local_rank, )
if "num_workers" in kwargs:
kwargs["num_workers"] = max(kwargs["num_workers"] // self.world_size, 1)
kwargs["num_workers"] = max(kwargs["num_workers"] //
self.world_size, 1)
else:
sampler = ResumableSequentialSampler(dataset, start_idx)
@ -182,8 +187,7 @@ class Accelerator: # pragma: no cover
dataset,
batch_sampler=sampler if self.use_ddp else None,
sampler=sampler if not self.use_ddp else None,
**kwargs,
)
**kwargs, )
return dataloader
@staticmethod

@ -106,7 +106,7 @@ class BaseModel(nn.Layer):
if not package:
state_dict = {"state_dict": self.state_dict(), "metadata": metadata}
paddle.save(state_dict, path)
paddle.save(state_dict, str(path))
else:
self._save_package(path, intern=intern, extern=extern, mock=mock)
@ -118,7 +118,7 @@ class BaseModel(nn.Layer):
the first parameter. May not be valid if model is split across
multiple devices.
"""
return list(self.parameters())[0].device
return list(self.parameters())[0].place
@classmethod
def load(
@ -152,7 +152,7 @@ class BaseModel(nn.Layer):
try:
model = cls._load_package(location, package_name=package_name)
except:
model_dict = paddle.load(location, "cpu")
model_dict = paddle.load(location)
metadata = model_dict["metadata"]
metadata["kwargs"].update(kwargs)
@ -163,7 +163,7 @@ class BaseModel(nn.Layer):
metadata["kwargs"].pop(k)
model = cls(*args, **metadata["kwargs"])
model.load_state_dict(model_dict["state_dict"], strict=strict)
model.set_state_dict(model_dict["state_dict"])
model.metadata = metadata
return model
@ -220,7 +220,7 @@ class BaseModel(nn.Layer):
self.save(weights_path, package=False)
for path, obj in extra_data.items():
paddle.save(obj, target_base / path)
paddle.save(obj, str(target_base / path))
return target_base
@ -257,7 +257,7 @@ class BaseModel(nn.Layer):
model_pth = "package.pth" if package else "weights.pth"
model_pth = folder / model_pth
model = cls.load(model_pth, strict=strict)
model = cls.load(str(model_pth))
extra_data = {}
excluded = ["package.pth", "weights.pth"]
files = [
@ -265,6 +265,6 @@ class BaseModel(nn.Layer):
if x.is_file() and x.name not in excluded
]
for f in files:
extra_data[f.name] = paddle.load(f, **kwargs)
extra_data[f.name] = paddle.load(str(f), **kwargs)
return model, extra_data
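# Example (sketch): round-tripping weights with the non-packaged save path above.
# MyModel is a hypothetical BaseModel subclass whose __init__ kwargs are
# recorded in the saved metadata.
model = MyModel()
model.save("weights.pth", package=False)
restored = MyModel.load("weights.pth")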

@ -1,5 +0,0 @@
soundfile
librosa
scipy
rich
flatten_dict