fix codestyle

pull/3900/head
drryanhuang 10 months ago
parent 9e7dca2bc5
commit 1726e2fdfc

@ -8,13 +8,12 @@ import typing
import warnings
from collections import namedtuple
from pathlib import Path
from typing import Optional
import librosa
import numpy as np
import soundfile
import paddle
import librosa
from typing import Optional
import soundfile
import util
from resample import resample_frac
@ -48,9 +47,8 @@ def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> paddle.Tensor:
# http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II
n = paddle.arange(float(n_mels))
k = paddle.arange(float(n_mfcc)).unsqueeze([1])
dct = paddle.cos(
math.pi / float(n_mels) * (n + 0.5) * k
) # size (n_mfcc, n_mels)
dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) *
k) # size (n_mfcc, n_mels)
if norm is None:
dct *= 2.0
@ -68,8 +66,7 @@ STFTParams = namedtuple(
"window_type",
"match_stride",
"padding_type",
],
)
], )
"""
STFTParams object is a container that holds STFT parameters - window_length,
hop_length, and window_type. Not all parameters need to be specified. Ones that
@ -92,14 +89,14 @@ STFTParams.__new__.__defaults__ = (None, None, None, None, None)
class AudioSignal(
# EffectMixin,
# LoudnessMixin,
# PlayMixin,
# ImpulseResponseMixin,
# DSPMixin,
# DisplayMixin,
# FFMPEGMixin,
# WhisperMixin,
# EffectMixin,
# LoudnessMixin,
# PlayMixin,
# ImpulseResponseMixin,
# DSPMixin,
# DisplayMixin,
# FFMPEGMixin,
# WhisperMixin,
):
"""This is the core object of this library. Audio is always
loaded into an AudioSignal, which then enables all the features
@ -161,14 +158,14 @@ class AudioSignal(
"""
def __init__(
self,
audio_path_or_array: typing.Union[paddle.Tensor, str, Path, np.ndarray],
sample_rate: int = None,
stft_params: STFTParams = None,
offset: float = 0,
duration: float = None,
device: str = None,
):
self,
audio_path_or_array: typing.Union[paddle.Tensor, str, Path,
np.ndarray],
sample_rate: int=None,
stft_params: STFTParams=None,
offset: float=0,
duration: float=None,
device: str=None, ):
# ✅
audio_path = None
audio_array = None
@ -182,10 +179,8 @@ class AudioSignal(
elif paddle.is_tensor(audio_path_or_array):
audio_array = audio_path_or_array
else:
raise ValueError(
"audio_path_or_array must be either a Path, "
"string, numpy array, or paddle Tensor!"
)
raise ValueError("audio_path_or_array must be either a Path, "
"string, numpy array, or paddle Tensor!")
self.path_to_file = None
@ -194,8 +189,7 @@ class AudioSignal(
self.stft_data = None
if audio_path is not None:
self.load_from_file(
audio_path, offset=offset, duration=duration, device=device
)
audio_path, offset=offset, duration=duration, device=device)
elif audio_array is not None:
assert sample_rate is not None, "Must set sample rate!"
self.load_from_array(audio_array, sample_rate, device=device)
@ -210,8 +204,7 @@ class AudioSignal(
@property
def path_to_input_file(
self,
):
self, ):
"""
Path to input file, if it exists.
Alias to ``path_to_file`` for backwards compatibility
@ -220,13 +213,12 @@ class AudioSignal(
@classmethod
def excerpt(
cls,
audio_path: typing.Union[str, Path],
offset: float = None,
duration: float = None,
state: typing.Union[np.random.RandomState, int] = None,
**kwargs,
):
cls,
audio_path: typing.Union[str, Path],
offset: float=None,
duration: float=None,
state: typing.Union[np.random.RandomState, int]=None,
**kwargs, ):
"""✅Randomly draw an excerpt of ``duration`` seconds from an
audio file specified at ``audio_path``, between ``offset`` seconds
and end of file. ``state`` can be used to seed the random draw.
@ -268,13 +260,12 @@ class AudioSignal(
@classmethod
def salient_excerpt(
cls,
audio_path: typing.Union[str, Path],
loudness_cutoff: float = None,
num_tries: int = 8,
state: typing.Union[np.random.RandomState, int] = None,
**kwargs,
):
cls,
audio_path: typing.Union[str, Path],
loudness_cutoff: float=None,
num_tries: int=8,
state: typing.Union[np.random.RandomState, int]=None,
**kwargs, ):
"""❌Similar to AudioSignal.excerpt, except it extracts excerpts only
if they are above a specified loudness threshold, which is computed via
a fast LUFS routine.
@ -329,13 +320,12 @@ class AudioSignal(
@classmethod
def zeros(
cls,
duration: float,
sample_rate: int,
num_channels: int = 1,
batch_size: int = 1,
**kwargs,
):
cls,
duration: float,
sample_rate: int,
num_channels: int=1,
batch_size: int=1,
**kwargs, ):
"""✅Helper function create an AudioSignal of all zeros.
Parameters
@ -364,19 +354,17 @@ class AudioSignal(
return cls(
paddle.zeros([batch_size, num_channels, n_samples]),
sample_rate,
**kwargs,
)
**kwargs, )
@classmethod
def wave(
cls,
frequency: float,
duration: float,
sample_rate: int,
num_channels: int = 1,
shape: str = "sine",
**kwargs,
):
cls,
frequency: float,
duration: float,
sample_rate: int,
num_channels: int=1,
shape: str="sine",
**kwargs, ):
"""
Generate a waveform of a given frequency and shape.
@ -423,13 +411,12 @@ class AudioSignal(
@classmethod
def batch(
cls,
audio_signals: list,
pad_signals: bool = False,
truncate_signals: bool = False,
resample: bool = False,
dim: int = 0,
):
cls,
audio_signals: list,
pad_signals: bool=False,
truncate_signals: bool=False,
resample: bool=False,
dim: int=0, ):
"""✅Creates a batched AudioSignal from a list of AudioSignals.
Parameters
@ -500,29 +487,25 @@ class AudioSignal(
raise RuntimeError(
f"Not all signals had the same length! Got {signal_lengths}. "
f"All signals must be the same length, or pad_signals/truncate_signals "
f"must be True. "
)
f"must be True. ")
# Concatenate along the specified dimension (default 0)
audio_data = paddle.concat(
[x.audio_data for x in audio_signals], axis=dim
)
[x.audio_data for x in audio_signals], axis=dim)
audio_paths = [x.path_to_file for x in audio_signals]
batched_signal = cls(
audio_data,
sample_rate=audio_signals[0].sample_rate,
)
sample_rate=audio_signals[0].sample_rate, )
batched_signal.path_to_file = audio_paths
return batched_signal
# I/O
def load_from_file(
self,
audio_path: typing.Union[str, Path],
offset: float,
duration: float,
device: str = "cpu",
):
self,
audio_path: typing.Union[str, Path],
offset: float,
duration: float,
device: str="cpu", ):
"""✅Loads data from file. Used internally when AudioSignal
is instantiated with a path to a file.
@ -548,8 +531,7 @@ class AudioSignal(
offset=offset,
duration=duration,
sr=None,
mono=False,
)
mono=False, )
data = util.ensure_tensor(data)
if data.shape[-1] == 0:
raise RuntimeError(
@ -569,11 +551,10 @@ class AudioSignal(
return self.to(device)
def load_from_array(
self,
audio_array: typing.Union[paddle.Tensor, np.ndarray],
sample_rate: int,
device: str = "cpu",
):
self,
audio_array: typing.Union[paddle.Tensor, np.ndarray],
sample_rate: int,
device: str="cpu", ):
"""✅Loads data from array, reshaping it to be exactly 3
dimensions. Used internally when AudioSignal is called
with a tensor or an array.
@ -646,8 +627,7 @@ class AudioSignal(
if self.audio_data[0].abs().max() > 1:
warnings.warn("Audio amplitude > 1 clipped when saving")
soundfile.write(
str(audio_path), self.audio_data[0].numpy().T, self.sample_rate
)
str(audio_path), self.audio_data[0].numpy().T, self.sample_rate)
self.path_to_file = audio_path
return self
@ -689,8 +669,7 @@ class AudioSignal(
clone = type(self)(
self.audio_data.clone(),
self.sample_rate,
stft_params=self.stft_params,
)
stft_params=self.stft_params, )
if self.stft_data is not None:
clone.stft_data = self.stft_data.clone()
if self._loudness is not None:
@ -777,9 +756,8 @@ class AudioSignal(
"""
if sample_rate == self.sample_rate:
return self
self.audio_data = resample_frac(
self.audio_data, self.sample_rate, sample_rate
)
self.audio_data = resample_frac(self.audio_data, self.sample_rate,
sample_rate)
self.sample_rate = sample_rate
return self
@ -861,11 +839,10 @@ class AudioSignal(
AudioSignal with padding applied.
"""
self.audio_data = paddle.nn.functional.pad(
self.audio_data, (before, after), data_format="NCL"
)
self.audio_data, (before, after), data_format="NCL")
return self
def zero_pad_to(self, length: int, mode: str = "after"):
def zero_pad_to(self, length: int, mode: str="after"):
"""✅Pad with zeros to a specified length, either before or after
the audio data.
@ -990,10 +967,8 @@ class AudioSignal(
def stft_data(self, data: typing.Union[paddle.Tensor, np.ndarray]):
if data is not None:
assert paddle.is_tensor(data) and paddle.is_complex(data)
if (
self.stft_data is not None
and self.stft_data.shape != data.shape
):
if (self.stft_data is not None and
self.stft_data.shape != data.shape):
warnings.warn("stft_data changed shape")
self._stft_data = data
return
@ -1062,7 +1037,7 @@ class AudioSignal(
# STFT
@staticmethod
@functools.lru_cache(None)
def get_window(window_type: str, window_length: int, device: str = None):
def get_window(window_type: str, window_length: int, device: str=None):
"""✅Wrapper around scipy.signal.get_window so one can also get the
popular sqrt-hann window. This function caches for efficiency
using functools.lru_cache.
@ -1118,7 +1093,7 @@ class AudioSignal(
@stft_params.setter
def stft_params(self, value: STFTParams):
# ✅
default_win_len = int(2 ** (np.ceil(np.log2(0.032 * self.sample_rate))))
default_win_len = int(2**(np.ceil(np.log2(0.032 * self.sample_rate))))
default_hop_len = default_win_len // 4
default_win_type = "hann"
default_match_stride = False
@ -1129,8 +1104,7 @@ class AudioSignal(
hop_length=default_hop_len,
window_type=default_win_type,
match_stride=default_match_stride,
padding_type=default_padding_type,
)._asdict()
padding_type=default_padding_type, )._asdict()
value = value._asdict() if value else default_stft_params
@ -1141,9 +1115,10 @@ class AudioSignal(
self._stft_params = STFTParams(**value)
self.stft_data = None
def compute_stft_padding(
self, window_length: int, hop_length: int, match_stride: bool
):
def compute_stft_padding(self,
window_length: int,
hop_length: int,
match_stride: bool):
"""✅Compute how the STFT should be padded, based on match\_stride.
Parameters
@ -1164,9 +1139,8 @@ class AudioSignal(
length = self.signal_length
if match_stride:
assert (
hop_length == window_length // 4
), "For match_stride, hop must equal n_fft // 4"
assert (hop_length == window_length //
4), "For match_stride, hop must equal n_fft // 4"
right_pad = math.ceil(length / hop_length) * hop_length - length
pad = (window_length - hop_length) // 2
else:
@ -1176,13 +1150,12 @@ class AudioSignal(
return right_pad, pad
def stft(
self,
window_length: int = None,
hop_length: int = None,
window_type: str = None,
match_stride: bool = None,
padding_type: str = None,
):
self,
window_length: int=None,
hop_length: int=None,
window_type: str=None,
match_stride: bool=None,
padding_type: str=None, ):
"""✅Computes the short-time Fourier transform of the audio data,
with specified STFT parameters.
@ -1219,55 +1192,38 @@ class AudioSignal(
>>> signal.stft()
"""
window_length = (
self.stft_params.window_length
if window_length is None
else int(window_length)
)
hop_length = (
self.stft_params.hop_length
if hop_length is None
else int(hop_length)
)
window_type = (
self.stft_params.window_type if window_type is None else window_type
)
match_stride = (
self.stft_params.match_stride
if match_stride is None
else match_stride
)
padding_type = (
self.stft_params.padding_type
if padding_type is None
else padding_type
)
window_length = (self.stft_params.window_length
if window_length is None else int(window_length))
hop_length = (self.stft_params.hop_length
if hop_length is None else int(hop_length))
window_type = (self.stft_params.window_type
if window_type is None else window_type)
match_stride = (self.stft_params.match_stride
if match_stride is None else match_stride)
padding_type = (self.stft_params.padding_type
if padding_type is None else padding_type)
window = self.get_window(window_type, window_length)
# window = window.to(self.audio_data.device)
audio_data = self.audio_data
right_pad, pad = self.compute_stft_padding(
window_length, hop_length, match_stride
)
right_pad, pad = self.compute_stft_padding(window_length, hop_length,
match_stride)
audio_data = paddle.nn.functional.pad(
x=audio_data,
pad=[pad, pad + right_pad],
mode="reflect",
data_format="NCL",
)
data_format="NCL", )
stft_data = paddle.signal.stft(
audio_data.reshape([-1, audio_data.shape[-1]]),
n_fft=window_length,
hop_length=hop_length,
window=window,
# return_complex=True,
center=True,
)
center=True, )
_, nf, nt = stft_data.shape
stft_data = stft_data.reshape(
[self.batch_size, self.num_channels, nf, nt]
)
[self.batch_size, self.num_channels, nf, nt])
if match_stride:
# Drop first two and last two frames, which are added
@ -1278,13 +1234,12 @@ class AudioSignal(
return stft_data
def istft(
self,
window_length: int = None,
hop_length: int = None,
window_type: str = None,
match_stride: bool = None,
length: int = None,
):
self,
window_length: int=None,
hop_length: int=None,
window_type: str=None,
match_stride: bool=None,
length: int=None, ):
"""✅Computes inverse STFT and sets it to audio\_data.
Parameters
@ -1314,34 +1269,22 @@ class AudioSignal(
if self.stft_data is None:
raise RuntimeError("Cannot do inverse STFT without self.stft_data!")
window_length = (
self.stft_params.window_length
if window_length is None
else int(window_length)
)
hop_length = (
self.stft_params.hop_length
if hop_length is None
else int(hop_length)
)
window_type = (
self.stft_params.window_type if window_type is None else window_type
)
match_stride = (
self.stft_params.match_stride
if match_stride is None
else match_stride
)
window = self.get_window(
window_type, window_length, self.stft_data.place
)
window_length = (self.stft_params.window_length
if window_length is None else int(window_length))
hop_length = (self.stft_params.hop_length
if hop_length is None else int(hop_length))
window_type = (self.stft_params.window_type
if window_type is None else window_type)
match_stride = (self.stft_params.match_stride
if match_stride is None else match_stride)
window = self.get_window(window_type, window_length,
self.stft_data.place)
nb, nch, nf, nt = self.stft_data.shape
stft_data = self.stft_data.reshape([nb * nch, nf, nt])
right_pad, pad = self.compute_stft_padding(
window_length, hop_length, match_stride
)
right_pad, pad = self.compute_stft_padding(window_length, hop_length,
match_stride)
if length is None:
length = self.original_signal_length
@ -1351,8 +1294,7 @@ class AudioSignal(
# Zero-pad the STFT on either side, putting back the frames that were
# dropped in stft().
stft_data = paddle.nn.functional.pad(
stft_data, pad=(2, 2), data_format="NCL"
)
stft_data, pad=(2, 2), data_format="NCL")
audio_data = paddle.signal.istft(
stft_data,
@ -1360,20 +1302,21 @@ class AudioSignal(
hop_length=hop_length,
window=window,
length=length,
center=True,
)
center=True, )
audio_data = audio_data.reshape([nb, nch, -1])
if match_stride:
audio_data = audio_data[..., pad : -(pad + right_pad)]
audio_data = audio_data[..., pad:-(pad + right_pad)]
self.audio_data = audio_data
return self
@staticmethod
@functools.lru_cache(None)
def get_mel_filters(
sr: int, n_fft: int, n_mels: int, fmin: float = 0.0, fmax: float = None
):
def get_mel_filters(sr: int,
n_fft: int,
n_mels: int,
fmin: float=0.0,
fmax: float=None):
"""✅Create a Filterbank matrix to combine FFT bins into Mel-frequency bins.
Parameters
@ -1401,16 +1344,14 @@ class AudioSignal(
n_fft=n_fft,
n_mels=n_mels,
fmin=fmin,
fmax=fmax,
)
fmax=fmax, )
def mel_spectrogram(
self,
n_mels: int = 80,
mel_fmin: float = 0.0,
mel_fmax: float = None,
**kwargs,
):
self,
n_mels: int=80,
mel_fmin: float=0.0,
mel_fmax: float=None,
**kwargs, ):
"""✅Computes a Mel spectrogram.
Parameters
@ -1438,8 +1379,7 @@ class AudioSignal(
n_fft=2 * (nf - 1),
n_mels=n_mels,
fmin=mel_fmin,
fmax=mel_fmax,
)
fmax=mel_fmax, )
mel_basis = paddle.to_tensor(mel_basis)
mel_spectrogram = magnitude.transpose([0, 1, 3, 2]) @ mel_basis.T
@ -1448,9 +1388,7 @@ class AudioSignal(
@staticmethod
@functools.lru_cache(None)
def get_dct(
n_mfcc: int, n_mels: int, norm: str = "ortho", device: str = None
):
def get_dct(n_mfcc: int, n_mels: int, norm: str="ortho", device: str=None):
"""✅Create a discrete cosine transform (DCT) transformation matrix with shape (``n_mels``, ``n_mfcc``),
it can be normalized depending on norm. For more information about dct:
http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II
@ -1476,12 +1414,11 @@ class AudioSignal(
return create_dct(n_mfcc, n_mels, norm)
def mfcc(
self,
n_mfcc: int = 40,
n_mels: int = 80,
log_offset: float = 1e-6,
**kwargs,
):
self,
n_mfcc: int=40,
n_mels: int=80,
log_offset: float=1e-6,
**kwargs, ):
"""✅Computes mel-frequency cepstral coefficients (MFCCs).
Parameters
@ -1538,9 +1475,10 @@ class AudioSignal(
self.stft_data = value * paddle.exp(1j * self.phase)
return
def log_magnitude(
self, ref_value: float = 1.0, amin: float = 1e-5, top_db: float = 80.0
):
def log_magnitude(self,
ref_value: float=1.0,
amin: float=1e-5,
top_db: float=80.0):
"""✅Computes the log-magnitude of the spectrogram.
Parameters
@ -1637,22 +1575,25 @@ class AudioSignal(
# Representation
def _info(self):
# ✅
dur = (
f"{self.signal_duration:0.3f}"
if self.signal_duration
else "[unknown]"
)
dur = (f"{self.signal_duration:0.3f}"
if self.signal_duration else "[unknown]")
info = {
"duration": f"{dur} seconds",
"batch_size": self.batch_size,
"path": self.path_to_file if self.path_to_file else "path unknown",
"sample_rate": self.sample_rate,
"num_channels": (
self.num_channels if self.num_channels else "[unknown]"
),
"audio_data.shape": self.audio_data.shape,
"stft_params": self.stft_params,
"device": self.device,
"duration":
f"{dur} seconds",
"batch_size":
self.batch_size,
"path":
self.path_to_file if self.path_to_file else "path unknown",
"sample_rate":
self.sample_rate,
"num_channels": (self.num_channels
if self.num_channels else "[unknown]"),
"audio_data.shape":
self.audio_data.shape,
"stft_params":
self.stft_params,
"device":
self.device,
}
return info
@ -1728,25 +1669,21 @@ class AudioSignal(
stft_data = self.stft_data
elif isinstance(key, (bool, int, list, slice, tuple)) or (
paddle.is_tensor(key) and key.ndim <= 1
):
paddle.is_tensor(key) and key.ndim <= 1):
# Indexing only on the batch dimension.
# Then let's copy over relevant stuff.
# Future work: make this work for time-indexing
# as well, using the hop length.
audio_data = self.audio_data[key]
_loudness = (
self._loudness[key] if self._loudness is not None else None
)
stft_data = (
self.stft_data[key] if self.stft_data is not None else None
)
_loudness = (self._loudness[key]
if self._loudness is not None else None)
stft_data = (self.stft_data[key]
if self.stft_data is not None else None)
sources = None
copy = type(self)(
audio_data, self.sample_rate, stft_params=self.stft_params
)
audio_data, self.sample_rate, stft_params=self.stft_params)
copy._loudness = _loudness
copy._stft_data = stft_data
copy.sources = sources
@ -1766,8 +1703,7 @@ class AudioSignal(
return
elif isinstance(key, (bool, int, list, slice, tuple)) or (
paddle.is_tensor(key) and key.ndim <= 1
):
paddle.is_tensor(key) and key.ndim <= 1):
if self.audio_data is not None and value.audio_data is not None:
self.audio_data[key] = value.audio_data
if self._loudness is not None and value._loudness is not None:

@ -1,13 +1,13 @@
import inspect
from typing import Optional, Sequence
import math
from typing import Optional
from typing import Sequence
import paddle
import paddle.nn.functional as F
import math
def simple_repr(
obj, attrs: Optional[Sequence[str]] = None, overrides: dict = {}
):
def simple_repr(obj, attrs: Optional[Sequence[str]]=None, overrides: dict={}):
"""
Return a simple representation string for `obj`.
If `attrs` is not None, it should be a list of attributes to include.
@ -45,8 +45,7 @@ def sinc(x: paddle.Tensor):
return paddle.where(
x == 0,
paddle.to_tensor(1.0, dtype=x.dtype, place=x.place),
paddle.sin(x) / x,
)
paddle.sin(x) / x, )
class ResampleFrac(paddle.nn.Layer):
@ -54,9 +53,11 @@ class ResampleFrac(paddle.nn.Layer):
Resampling from the sample rate `old_sr` to `new_sr`.
"""
def __init__(
self, old_sr: int, new_sr: int, zeros: int = 24, rolloff: float = 0.945
):
def __init__(self,
old_sr: int,
new_sr: int,
zeros: int=24,
rolloff: float=0.945):
"""
Args:
old_sr (int): sample rate of the input signal x.
@ -129,13 +130,12 @@ class ResampleFrac(paddle.nn.Layer):
# There is probably a way to evaluate those filters more efficiently, but this is kept for
# future work.
idx = paddle.arange(
-self._width, self._width + self.old_sr, dtype="float32"
)
-self._width, self._width + self.old_sr, dtype="float32")
for i in range(self.new_sr):
t = (-i / self.new_sr + idx / self.old_sr) * sr
t = paddle.clip(t, -self.zeros, self.zeros)
t *= math.pi
window = paddle.cos(t / self.zeros / 2) ** 2
window = paddle.cos(t / self.zeros / 2)**2
kernel = sinc(t) * window
# Renormalize kernel to ensure a constant signal is preserved.
kernel = kernel / kernel.sum()
@ -144,16 +144,14 @@ class ResampleFrac(paddle.nn.Layer):
_kernel = paddle.stack(kernels).reshape([self.new_sr, 1, -1])
self.kernel = self.create_parameter(
shape=_kernel.shape,
dtype=_kernel.dtype,
)
dtype=_kernel.dtype, )
self.kernel.set_value(_kernel)
def forward(
self,
x: paddle.Tensor,
output_length: Optional[int] = None,
full: bool = False,
):
self,
x: paddle.Tensor,
output_length: Optional[int]=None,
full: bool=False, ):
"""
Resample x.
Args:
@ -176,35 +174,29 @@ class ResampleFrac(paddle.nn.Layer):
x.unsqueeze(1),
[self._width, self._width + self.old_sr],
mode="replicate",
data_format="NCL",
)
data_format="NCL", )
ys = F.conv1d(x, self.kernel, stride=self.old_sr, data_format="NCL")
y = ys.transpose([0, 2, 1]).reshape(list(shape[:-1]) + [-1])
float_output_length = paddle.to_tensor(
self.new_sr * length / self.old_sr, dtype="float32"
)
self.new_sr * length / self.old_sr, dtype="float32")
max_output_length = paddle.ceil(float_output_length).astype("int64")
default_output_length = paddle.floor(float_output_length).astype(
"int64"
)
"int64")
if output_length is None:
applied_output_length = (
max_output_length if full else default_output_length
)
applied_output_length = (max_output_length
if full else default_output_length)
elif output_length < 0 or output_length > max_output_length:
raise ValueError(
f"output_length must be between 0 and {max_output_length.numpy()}"
)
else:
applied_output_length = paddle.to_tensor(
output_length, dtype="int64"
)
output_length, dtype="int64")
if full:
raise ValueError(
"You cannot pass both full=True and output_length"
)
"You cannot pass both full=True and output_length")
return y[..., :applied_output_length]
def __repr__(self):
@ -212,14 +204,13 @@ class ResampleFrac(paddle.nn.Layer):
def resample_frac(
x: paddle.Tensor,
old_sr: int,
new_sr: int,
zeros: int = 24,
rolloff: float = 0.945,
output_length: Optional[int] = None,
full: bool = False,
):
x: paddle.Tensor,
old_sr: int,
new_sr: int,
zeros: int=24,
rolloff: float=0.945,
output_length: Optional[int]=None,
full: bool=False, ):
"""
Functional version of `ResampleFrac`, refer to its documentation for more information.
@ -228,9 +219,7 @@ def resample_frac(
resampling kernel will be recomputed every time. For best performance, you should use
and cache an instance of `ResampleFrac`.
"""
return ResampleFrac(old_sr, new_sr, zeros, rolloff)(
x, output_length, full
)
return ResampleFrac(old_sr, new_sr, zeros, rolloff)(x, output_length, full)
if __name__ == "__main__":

@ -5,14 +5,16 @@ import numbers
import os
import random
import typing
import soundfile
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional, List
from typing import Dict
from typing import List
from typing import Optional
import numpy as np
import paddle
import soundfile
from flatten_dict import flatten
from flatten_dict import unflatten
@ -43,10 +45,9 @@ def info(audio_path: str):
def ensure_tensor(
x: typing.Union[np.ndarray, paddle.Tensor, float, int],
ndim: int = None,
batch_size: int = None,
):
x: typing.Union[np.ndarray, paddle.Tensor, float, int],
ndim: int=None,
batch_size: int=None, ):
"""✅Ensures that the input ``x`` is a tensor of specified
dimensions and batch size.
@ -146,10 +147,8 @@ def random_state(seed: typing.Union[int, np.random.RandomState]):
elif isinstance(seed, np.random.RandomState):
return seed
else:
raise ValueError(
"%r cannot be used to seed a numpy.random.RandomState"
" instance" % seed
)
raise ValueError("%r cannot be used to seed a numpy.random.RandomState"
" instance" % seed)
def seed(random_seed, set_cudnn=False):
@ -214,7 +213,7 @@ def _close_temp_files(tmpfiles: list):
AUDIO_EXTENSIONS = [".wav", ".flac", ".mp3", ".mp4"]
def find_audio(folder: str, ext: List[str] = AUDIO_EXTENSIONS):
def find_audio(folder: str, ext: List[str]=AUDIO_EXTENSIONS):
"""Finds all audio files in a directory recursively.
Returns a list.
@ -244,11 +243,10 @@ def find_audio(folder: str, ext: List[str] = AUDIO_EXTENSIONS):
def read_sources(
sources: List[str],
remove_empty: bool = True,
relative_path: str = "",
ext: List[str] = AUDIO_EXTENSIONS,
):
sources: List[str],
remove_empty: bool=True,
relative_path: str="",
ext: List[str]=AUDIO_EXTENSIONS, ):
"""Reads audio sources that can either be folders
full of audio files, or CSV files that contain paths
to audio files. CSV files that adhere to the expected
@ -291,9 +289,9 @@ def read_sources(
return files
def choose_from_list_of_lists(
state: np.random.RandomState, list_of_lists: list, p: float = None
):
def choose_from_list_of_lists(state: np.random.RandomState,
list_of_lists: list,
p: float=None):
"""Choose a single item from a list of lists.
Parameters
@ -335,9 +333,8 @@ def chdir(newdir: typing.Union[Path, str]):
os.chdir(curdir)
def prepare_batch(
batch: typing.Union[dict, list, paddle.Tensor], device: str = "cpu"
):
def prepare_batch(batch: typing.Union[dict, list, paddle.Tensor],
device: str="cpu"):
"""Moves items in a batch (typically generated by a DataLoader as a list
or a dict) to the specified device. This works even if dictionaries
are nested.
@ -374,7 +371,7 @@ def prepare_batch(
return batch
def sample_from_dist(dist_tuple: tuple, state: np.random.RandomState = None):
def sample_from_dist(dist_tuple: tuple, state: np.random.RandomState=None):
"""Samples from a distribution defined by a tuple. The first
item in the tuple is the distribution type, and the rest of the
items are arguments to that distribution. The distribution function
@ -417,7 +414,7 @@ def sample_from_dist(dist_tuple: tuple, state: np.random.RandomState = None):
return dist_fn(*dist_tuple[1:])
def collate(list_of_dicts: list, n_splits: int = None):
def collate(list_of_dicts: list, n_splits: int=None):
"""Collates a list of dictionaries (e.g. as returned by a
dataloader) into a dictionary with batched values. This routine
uses the default paddle collate function for everything
@ -454,9 +451,10 @@ def collate(list_of_dicts: list, n_splits: int = None):
for i in range(0, list_len, n_items):
# Flatten the dictionaries to avoid recursion.
list_of_dicts_ = [flatten(d) for d in list_of_dicts[i : i + n_items]]
list_of_dicts_ = [flatten(d) for d in list_of_dicts[i:i + n_items]]
dict_of_lists = {
k: [dic[k] for dic in list_of_dicts_] for k in list_of_dicts_[0]
k: [dic[k] for dic in list_of_dicts_]
for k in list_of_dicts_[0]
}
batch = {}
@ -467,8 +465,7 @@ def collate(list_of_dicts: list, n_splits: int = None):
else:
# Borrow the default collate fn from paddle.
batch[k] = paddle.utils.data._utils.collate.default_collate(
v
)
v)
batches.append(unflatten(batch))
batches = batches[0] if not return_list else batches
@ -480,13 +477,12 @@ DEFAULT_FIG_SIZE = (9, 3)
def format_figure(
fig_size: tuple = None,
title: str = None,
fig=None,
format_axes: bool = True,
format: bool = True,
font_color: str = "white",
):
fig_size: tuple=None,
title: str=None,
fig=None,
format_axes: bool=True,
format: bool=True,
font_color: str="white", ):
"""Prettifies the spectrogram and waveform plots. A title
can be inset into the top right corner, and the axes can be
inset into the figure, allowing the data to take up the entire
@ -546,8 +542,7 @@ def format_figure(
va="top",
color=font_color,
fontsize=12 * font_scale,
alpha=0.75,
)
alpha=0.75, )
ticks = ax.get_xticks()[2:]
for t in ticks[:-1]:
@ -561,8 +556,7 @@ def format_figure(
va="bottom",
color=font_color,
fontsize=12 * font_scale,
alpha=0.75,
)
alpha=0.75, )
ax.margins(0, 0)
ax.set_axis_off()
@ -570,8 +564,7 @@ def format_figure(
ax.yaxis.set_major_locator(plt.NullLocator())
plt.subplots_adjust(
top=1, bottom=0, right=1, left=0, hspace=0, wspace=0
)
top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
if title is not None:
t = axs[0].annotate(
@ -583,20 +576,18 @@ def format_figure(
textcoords="offset points",
ha="right",
va="top",
color="white",
)
color="white", )
t.set_bbox(dict(facecolor="black", alpha=0.5, edgecolor="black"))
def generate_chord_dataset(
max_voices: int = 8,
sample_rate: int = 44100,
num_items: int = 5,
duration: float = 1.0,
min_note: str = "C2",
max_note: str = "C6",
output_dir: Path = "chords",
):
max_voices: int=8,
sample_rate: int=44100,
num_items: int=5,
duration: float=1.0,
min_note: str="C2",
max_note: str="C6",
output_dir: Path="chords", ):
"""
Generates a toy multitrack dataset of chords, synthesized from sine waves.
@ -640,8 +631,7 @@ def generate_chord_dataset(
frequency=librosa.midi_to_hz(midinote),
duration=dur,
sample_rate=sample_rate,
shape="sine",
)
shape="sine", )
track[f"voice_{voice_idx}"] = sig
tracks.append(track)

Loading…
Cancel
Save