add 2 file test

pull/3900/head
drryanhuang 9 months ago
parent 643f1c6071
commit 080bd7f5db

@ -17,14 +17,13 @@ import soundfile
from . import util
from ._julius import resample_frac
from .display import DisplayMixin
from .dsp import DSPMixin
from .effects import EffectMixin
from .effects import ImpulseResponseMixin
from .ffmpeg import FFMPEGMixin
from .loudness import LoudnessMixin
# from .display import DisplayMixin
# from .playback import PlayMixin
# from .whisper import WhisperMixin
@ -98,7 +97,7 @@ class AudioSignal(
# PlayMixin,
ImpulseResponseMixin,
DSPMixin,
# DisplayMixin,
DisplayMixin,
FFMPEGMixin,
# WhisperMixin,
):
@ -1498,6 +1497,8 @@ class AudioSignal(
amin = amin**2
log_spec = 10.0 * paddle.log10(magnitude.pow(2).clip(min=amin))
if paddle.is_tensor(ref_value):
ref_value = ref_value.item()
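# ref_value may arrive as a 0-d paddle.Tensor (e.g. signal.magnitude.max()
# from specshow); convert it to a Python float before np.log10 sees it.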
log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
if top_db is not None:

@ -0,0 +1,191 @@
import inspect
import typing
from functools import wraps
from . import util
def format_figure(func):
"""Decorator for formatting figures produced by the code below.
See :py:func:`audiotools.core.util.format_figure` for more.
Parameters
----------
func : Callable
Plotting function that is decorated by this function.
"""
@wraps(func)
def wrapper(*args, **kwargs):
f_keys = inspect.signature(util.format_figure).parameters.keys()
f_kwargs = {}
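# Split the incoming kwargs: names matching util.format_figure parameters
# (e.g. ``title`` and ``format``, as the display tests below exercise) are
# routed to format_figure; everything else passes through to the plot call.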
for k, v in list(kwargs.items()):
if k in f_keys:
kwargs.pop(k)
f_kwargs[k] = v
func(*args, **kwargs)
util.format_figure(**f_kwargs)
return wrapper
class DisplayMixin:
@format_figure
def specshow(
self,
preemphasis: bool=False,
x_axis: str="time",
y_axis: str="linear",
n_mels: int=128,
**kwargs, ):
"""Displays a spectrogram, using ``librosa.display.specshow``.
Parameters
----------
preemphasis : bool, optional
Whether or not to apply preemphasis, which makes high
frequency detail easier to see, by default False
x_axis : str, optional
How to label the x axis, by default "time"
y_axis : str, optional
How to label the y axis, by default "linear"
n_mels : int, optional
If displaying a mel spectrogram with ``y_axis = "mel"``,
this controls the number of mels, by default 128.
kwargs : dict, optional
Keyword arguments to :py:func:`audiotools.core.util.format_figure`.
"""
import librosa
import librosa.display
# Always re-compute the STFT data before showing it, in case
# it changed.
signal = self.clone()
signal.stft_data = None
if preemphasis:
signal.preemphasis()
ref = signal.magnitude.max()
log_mag = signal.log_magnitude(ref_value=ref)
if y_axis == "mel":
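# For mel display, convert the amplitude mel spectrogram to dB and
# peak-normalize so the maximum sits at 0 dB.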
log_mag = 20 * signal.mel_spectrogram(n_mels).clip(1e-5).log10()
log_mag -= log_mag.max()
librosa.display.specshow(
log_mag.numpy()[0].mean(axis=0),
x_axis=x_axis,
y_axis=y_axis,
sr=signal.sample_rate,
**kwargs, )
@format_figure
def waveplot(self, x_axis: str="time", **kwargs):
"""Displays a waveform plot, using ``librosa.display.waveshow``.
Parameters
----------
x_axis : str, optional
How to label the x axis, by default "time"
kwargs : dict, optional
Keyword arguments to :py:func:`audiotools.core.util.format_figure`.
"""
import librosa
import librosa.display
audio_data = self.audio_data[0].mean(axis=0)
audio_data = audio_data.cpu().numpy()
plot_fn = "waveshow" if hasattr(librosa.display,
"waveshow") else "waveplot"
wave_plot_fn = getattr(librosa.display, plot_fn)
wave_plot_fn(audio_data, x_axis=x_axis, sr=self.sample_rate, **kwargs)
@format_figure
def wavespec(self, x_axis: str="time", **kwargs):
"""Displays a waveform plot, using ``librosa.display.waveshow``.
Parameters
----------
x_axis : str, optional
How to label the x axis, by default "time"
kwargs : dict, optional
Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow`.
"""
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
gs = GridSpec(6, 1)
plt.subplot(gs[0, :])
self.waveplot(x_axis=x_axis)
plt.subplot(gs[1:, :])
self.specshow(x_axis=x_axis, **kwargs)
def write_audio_to_tb(
self,
tag: str,
writer,
step: int=None,
plot_fn: typing.Union[typing.Callable, str]="specshow",
**kwargs, ):
"""Writes a signal and its spectrogram to Tensorboard. Will show up
under the Audio and Images tab in Tensorboard.
Parameters
----------
tag : str
Tag to write signal to (e.g. ``clean/sample_0.wav``). The image will be
written to the corresponding ``.png`` file (e.g. ``clean/sample_0.png``).
writer : SummaryWriter
A SummaryWriter object from PyTorch library.
step : int, optional
The step to write the signal to, by default None
plot_fn : typing.Union[typing.Callable, str], optional
How to create the image. Set to ``None`` to avoid plotting, by default "specshow"
kwargs : dict, optional
Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow` or
whatever ``plot_fn`` is set to.
"""
import matplotlib.pyplot as plt
audio_data = self.audio_data[0, 0].detach().cpu().numpy()
sample_rate = self.sample_rate
writer.add_audio(tag, audio_data, step, sample_rate)
if plot_fn is not None:
if isinstance(plot_fn, str):
plot_fn = getattr(self, plot_fn)
fig = plt.figure()
plt.clf()
plot_fn(**kwargs)
writer.add_figure(tag.replace("wav", "png"), fig, step)
def save_image(
self,
image_path: str,
plot_fn: typing.Union[typing.Callable, str]="specshow",
**kwargs, ):
"""Save AudioSignal spectrogram (or whatever ``plot_fn`` is set to) to
a specified file.
Parameters
----------
image_path : str
Where to save the file to.
plot_fn : typing.Union[typing.Callable, str], optional
How to create the image. Set to ``None`` to avoid plotting, by default "specshow"
kwargs : dict, optional
Keyword arguments to :py:func:`audiotools.core.display.DisplayMixin.specshow` or
whatever ``plot_fn`` is set to.
"""
import matplotlib.pyplot as plt
if isinstance(plot_fn, str):
plot_fn = getattr(self, plot_fn)
plt.clf()
plot_fn(**kwargs)
plt.savefig(image_path, bbox_inches="tight", pad_inches=0)
plt.close()

@ -7,148 +7,201 @@ from . import _julius
from . import util
def _unfold(x, kernel_sizes, strides):
# https://github.com/PaddlePaddle/Paddle/pull/70102
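# Workaround: unfold with a kernel height of 1 hits the bug fixed in the
# PR above, so pad a zero row to make the height 2, unfold with kernel
# (2, k), and keep only the first k channels (the row with the real data).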
if 1 == kernel_sizes[0]:
x_zeros = paddle.zeros_like(x)
x = paddle.concat([x, x_zeros], axis=2)
kernel_sizes = (2, kernel_sizes[1])
unfolded = paddle.nn.functional.unfold(
x,
kernel_sizes=kernel_sizes,
strides=strides, )
if 2 == kernel_sizes[0]:
unfolded = unfolded[:, :kernel_sizes[1]]
return unfolded
def _fold(x, output_sizes, kernel_sizes, strides):
# https://github.com/PaddlePaddle/Paddle/pull/70102
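# Same workaround for fold: widen the column tensor and the output height
# to 2, fold, then slice the result back down to a height of 1.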
if 1 == output_sizes[0] and 1 == kernel_sizes[0]:
x_zeros = paddle.zeros_like(x)
x = paddle.concat([x, x_zeros], axis=1)
output_sizes = (2, output_sizes[1])
kernel_sizes = (2, kernel_sizes[1])
fold = paddle.nn.functional.fold(
x,
output_sizes=output_sizes,
kernel_sizes=kernel_sizes,
strides=strides, )
if 2 == kernel_sizes[0]:
fold = fold[:, :, :1]
return fold
class DSPMixin:
_original_batch_size = None
_original_num_channels = None
_padded_signal_length = None
# def _preprocess_signal_for_windowing(self, window_duration, hop_duration):
# self._original_batch_size = self.batch_size
# self._original_num_channels = self.num_channels
# window_length = int(window_duration * self.sample_rate)
# hop_length = int(hop_duration * self.sample_rate)
# if window_length % hop_length != 0:
# factor = window_length // hop_length
# window_length = factor * hop_length
# self.zero_pad(hop_length, hop_length)
# self._padded_signal_length = self.signal_length
# return window_length, hop_length
# def windows(
# self, window_duration: float, hop_duration: float, preprocess: bool = True
# ):
# """Generator which yields windows of specified duration from signal with a specified
# hop length.
# Parameters
# ----------
# window_duration : float
# Duration of every window in seconds.
# hop_duration : float
# Hop between windows in seconds.
# preprocess : bool, optional
# Whether to preprocess the signal, so that the first sample is in
# the middle of the first window, by default True
# Yields
# ------
# AudioSignal
# Each window is returned as an AudioSignal.
# """
# if preprocess:
# window_length, hop_length = self._preprocess_signal_for_windowing(
# window_duration, hop_duration
# )
# self.audio_data = self.audio_data.reshape(-1, 1, self.signal_length)
# for b in range(self.batch_size):
# i = 0
# start_idx = i * hop_length
# while True:
# start_idx = i * hop_length
# i += 1
# end_idx = start_idx + window_length
# if end_idx > self.signal_length:
# break
# yield self[b, ..., start_idx:end_idx]
# def collect_windows(
# self, window_duration: float, hop_duration: float, preprocess: bool = True
# ):
# """Reshapes signal into windows of specified duration from signal with a specified
# hop length. Windows are placed along the batch dimension. Use with
# :py:func:`audiotools.core.dsp.DSPMixin.overlap_and_add` to reconstruct the
# original signal.
# Parameters
# ----------
# window_duration : float
# Duration of every window in seconds.
# hop_duration : float
# Hop between windows in seconds.
# preprocess : bool, optional
# Whether to preprocess the signal, so that the first sample is in
# the middle of the first window, by default True
# Returns
# -------
# AudioSignal
# AudioSignal unfolded with shape ``(nb * nch * num_windows, 1, window_length)``
# """
# if preprocess:
# window_length, hop_length = self._preprocess_signal_for_windowing(
# window_duration, hop_duration
# )
# # self.audio_data: (nb, nch, nt).
# unfolded = paddle.nn.functional.unfold(
# self.audio_data.reshape(-1, 1, 1, self.signal_length),
# kernel_size=(1, window_length),
# stride=(1, hop_length),
# )
# # unfolded: (nb * nch, window_length, num_windows).
# # -> (nb * nch * num_windows, 1, window_length)
# unfolded = unfolded.permute(0, 2, 1).reshape(-1, 1, window_length)
# self.audio_data = unfolded
# return self
# def overlap_and_add(self, hop_duration: float):
# """Function which takes a list of windows and overlap adds them into a
# signal the same length as ``audio_signal``.
# Parameters
# ----------
# hop_duration : float
# How much to shift for each window
# (overlap is window_duration - hop_duration) in seconds.
# Returns
# -------
# AudioSignal
# overlap-and-added signal.
# """
# hop_length = int(hop_duration * self.sample_rate)
# window_length = self.signal_length
# nb, nch = self._original_batch_size, self._original_num_channels
# unfolded = self.audio_data.reshape(nb * nch, -1, window_length).permute(0, 2, 1)
# folded = paddle.nn.functional.fold(
# unfolded,
# output_size=(1, self._padded_signal_length),
# kernel_size=(1, window_length),
# stride=(1, hop_length),
# )
# norm = paddle.ones_like(unfolded, device=unfolded.device)
# norm = paddle.nn.functional.fold(
# norm,
# output_size=(1, self._padded_signal_length),
# kernel_size=(1, window_length),
# stride=(1, hop_length),
# )
# folded = folded / norm
# folded = folded.reshape(nb, nch, -1)
# self.audio_data = folded
# self.trim(hop_length, hop_length)
# return self
def _preprocess_signal_for_windowing(self, window_duration, hop_duration):
self._original_batch_size = self.batch_size
self._original_num_channels = self.num_channels
window_length = int(window_duration * self.sample_rate)
hop_length = int(hop_duration * self.sample_rate)
if window_length % hop_length != 0:
factor = window_length // hop_length
window_length = factor * hop_length
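# Round window_length down to a whole multiple of hop_length so the
# unfold in collect_windows and the fold in overlap_and_add tile exactly.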
self.zero_pad(hop_length, hop_length)
self._padded_signal_length = self.signal_length
return window_length, hop_length
def windows(self,
window_duration: float,
hop_duration: float,
preprocess: bool=True):
"""Generator which yields windows of specified duration from signal with a specified
hop length.
Parameters
----------
window_duration : float
Duration of every window in seconds.
hop_duration : float
Hop between windows in seconds.
preprocess : bool, optional
Whether to preprocess the signal, so that the first sample is in
the middle of the first window, by default True
Yields
------
AudioSignal
Each window is returned as an AudioSignal.
"""
if preprocess:
window_length, hop_length = self._preprocess_signal_for_windowing(
window_duration, hop_duration)
self.audio_data = self.audio_data.reshape([-1, 1, self.signal_length])
for b in range(self.batch_size):
i = 0
start_idx = i * hop_length
while True:
start_idx = i * hop_length
i += 1
end_idx = start_idx + window_length
if end_idx > self.signal_length:
break
yield self[b, ..., start_idx:end_idx]
def collect_windows(self,
window_duration: float,
hop_duration: float,
preprocess: bool=True):
"""Reshapes signal into windows of specified duration from signal with a specified
hop length. Windows are placed along the batch dimension. Use with
:py:func:`audiotools.core.dsp.DSPMixin.overlap_and_add` to reconstruct the
original signal.
Parameters
----------
window_duration : float
Duration of every window in seconds.
hop_duration : float
Hop between windows in seconds.
preprocess : bool, optional
Whether to preprocess the signal, so that the first sample is in
the middle of the first window, by default True
Returns
-------
AudioSignal
AudioSignal unfolded with shape ``(nb * nch * num_windows, 1, window_length)``
"""
if preprocess:
window_length, hop_length = self._preprocess_signal_for_windowing(
window_duration, hop_duration)
# self.audio_data: (nb, nch, nt).
# unfolded = paddle.nn.functional.unfold(
# self.audio_data.reshape([-1, 1, 1, self.signal_length]),
# kernel_sizes=(1, window_length),
# strides=(1, hop_length),
# )
unfolded = _unfold(
self.audio_data.reshape([-1, 1, 1, self.signal_length]),
kernel_sizes=(1, window_length),
strides=(1, hop_length), )
# unfolded: (nb * nch, window_length, num_windows).
# -> (nb * nch * num_windows, 1, window_length)
unfolded = unfolded.transpose([0, 2, 1]).reshape([-1, 1, window_length])
self.audio_data = unfolded
return self
def overlap_and_add(self, hop_duration: float):
"""Function which takes a list of windows and overlap adds them into a
signal the same length as ``audio_signal``.
Parameters
----------
hop_duration : float
How much to shift for each window
(overlap is window_duration - hop_duration) in seconds.
Returns
-------
AudioSignal
overlap-and-added signal.
"""
hop_length = int(hop_duration * self.sample_rate)
window_length = self.signal_length
nb, nch = self._original_batch_size, self._original_num_channels
unfolded = self.audio_data.reshape(
[nb * nch, -1, window_length]).transpose([0, 2, 1])
# folded = paddle.nn.functional.fold(
# unfolded,
# output_sizes=(1, self._padded_signal_length),
# kernel_sizes=(1, window_length),
# strides=(1, hop_length),
# )
folded = _fold(
unfolded,
output_sizes=(1, self._padded_signal_length),
kernel_sizes=(1, window_length),
strides=(1, hop_length), )
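# Fold a same-shaped tensor of ones to count how many windows overlap
# each output sample, then divide to undo the summation done by fold.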
norm = paddle.ones_like(unfolded)
# norm = paddle.nn.functional.fold(
# norm,
# output_sizes=(1, self._padded_signal_length),
# kernel_sizes=(1, window_length),
# strides=(1, hop_length),
# )
norm = _fold(
norm,
output_sizes=(1, self._padded_signal_length),
kernel_sizes=(1, window_length),
strides=(1, hop_length), )
folded = folded / norm
folded = folded.reshape([nb, nch, -1])
self.audio_data = folded
self.trim(hop_length, hop_length)
return self
def low_pass(self,
cutoffs: typing.Union[paddle.Tensor, np.ndarray, float],
@ -312,87 +365,92 @@ class DSPMixin:
self.stft_data = mag * paddle.exp(1j * phase)
return self
# def mask_low_magnitudes(
# self, db_cutoff: typing.Union[paddle.Tensor, np.ndarray, float], val: float = 0.0
# ):
# """Mask away magnitudes below a specified threshold, which
# can be different for every item in the batch.
# Parameters
# ----------
# db_cutoff : typing.Union[paddle.Tensor, np.ndarray, float]
# Decibel value for which things below it will be masked away.
# val : float, optional
# Value to fill in for masked portions, by default 0.0
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# mag = self.magnitude
# log_mag = self.log_magnitude()
# db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
# mask = log_mag < db_cutoff
# mag = mag.masked_fill(mask, val)
# self.magnitude = mag
# return self
# def shift_phase(self, shift: typing.Union[paddle.Tensor, np.ndarray, float]):
# """Shifts the phase by a constant value.
# Parameters
# ----------
# shift : typing.Union[paddle.Tensor, np.ndarray, float]
# What to shift the phase by.
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
# self.phase = self.phase + shift
# return self
# def corrupt_phase(self, scale: typing.Union[paddle.Tensor, np.ndarray, float]):
# """Corrupts the phase randomly by some scaled value.
# Parameters
# ----------
# scale : typing.Union[paddle.Tensor, np.ndarray, float]
# Standard deviation of noise to add to the phase.
# Returns
# -------
# AudioSignal
# Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
# masked audio data.
# """
# scale = util.ensure_tensor(scale, ndim=self.phase.ndim)
# self.phase = self.phase + scale * paddle.randn_like(self.phase)
# return self
# def preemphasis(self, coef: float = 0.85):
# """Applies pre-emphasis to audio signal.
# Parameters
# ----------
# coef : float, optional
# How much pre-emphasis to apply; lower values do less, and 0 does nothing.
# By default 0.85.
# Returns
# -------
# AudioSignal
# Pre-emphasized signal.
# """
# kernel = paddle.to_tensor([1, -coef, 0]).view(1, 1, -1).to(self.device)
# x = self.audio_data.reshape(-1, 1, self.signal_length)
# x = paddle.nn.functional.conv1d(x, kernel, padding=1)
# self.audio_data = x.reshape(*self.audio_data.shape)
# return self
def mask_low_magnitudes(
self,
db_cutoff: typing.Union[paddle.Tensor, np.ndarray, float],
val: float=0.0):
"""Mask away magnitudes below a specified threshold, which
can be different for every item in the batch.
Parameters
----------
db_cutoff : typing.Union[paddle.Tensor, np.ndarray, float]
Decibel value for which things below it will be masked away.
val : float, optional
Value to fill in for masked portions, by default 0.0
Returns
-------
AudioSignal
Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
masked audio data.
"""
mag = self.magnitude
log_mag = self.log_magnitude()
db_cutoff = util.ensure_tensor(db_cutoff, ndim=mag.ndim)
mask = log_mag < db_cutoff
mag = mag.masked_fill(mask, val)
self.magnitude = mag
return self
def shift_phase(self,
shift: typing.Union[paddle.Tensor, np.ndarray, float]):
"""Shifts the phase by a constant value.
Parameters
----------
shift : typing.Union[paddle.Tensor, np.ndarray, float]
What to shift the phase by.
Returns
-------
AudioSignal
Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
masked audio data.
"""
shift = util.ensure_tensor(shift, ndim=self.phase.ndim)
self.phase = self.phase + shift
return self
def corrupt_phase(self,
scale: typing.Union[paddle.Tensor, np.ndarray, float]):
"""Corrupts the phase randomly by some scaled value.
Parameters
----------
scale : typing.Union[paddle.Tensor, np.ndarray, float]
Standard deviation of noise to add to the phase.
Returns
-------
AudioSignal
Signal with ``stft_data`` manipulated. Apply ``.istft()`` to get the
masked audio data.
"""
scale = util.ensure_tensor(scale, ndim=self.phase.ndim)
self.phase = self.phase + scale * paddle.randn(
shape=self.phase.shape, dtype=self.phase.dtype)
return self
def preemphasis(self, coef: float=0.85):
"""Applies pre-emphasis to audio signal.
Parameters
----------
coef : float, optional
How much pre-emphasis to apply; lower values do less, and 0 does nothing.
By default 0.85.
Returns
-------
AudioSignal
Pre-emphasized signal.
"""
kernel = paddle.to_tensor([1, -coef, 0]).reshape([1, 1, -1])
x = self.audio_data.reshape([-1, 1, self.signal_length])
x = paddle.nn.functional.conv1d(
x.astype(kernel.dtype), kernel, padding=1)
self.audio_data = x.reshape(self.audio_data.shape)
return self
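For orientation, the windowing round trip these Paddle helpers enable looks like the following minimal sketch (it mirrors test_overlap_add further down and assumes only the AudioSignal API used there):
import paddle
from audiotools import AudioSignal
# One second of noise: batch of 1, mono, 16 kHz.
signal = AudioSignal(paddle.randn([1, 1, 16000]), sample_rate=16000)
# Windows land on the batch axis: (num_windows, 1, window_length).
windowed = signal.deepcopy().collect_windows(
    window_duration=0.5, hop_duration=0.25)
# Overlap-add reconstructs a signal of the original length.
recombined = windowed.overlap_and_add(hop_duration=0.25)
assert recombined.signal_length == signal.signal_length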

@ -1,8 +1,8 @@
flatten_dict
gradio
IPython
librosa
markdown2
librosa==0.8.1
markdown2
numpy==1.23.5
pyloudnorm
pytest
pytest-xdist

@ -13,7 +13,7 @@ from audiotools import AudioSignal
def test_io():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(pathlib.Path(audio_path))
with tempfile.NamedTemporaryFile(suffix=".wav") as f:
@ -61,7 +61,7 @@ def test_io():
assert signal.audio_data.ndim == 3
assert paddle.all(signal.samples == signal.audio_data)
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
assert AudioSignal(audio_path).hash() == AudioSignal(audio_path).hash()
assert AudioSignal(audio_path).hash() != AudioSignal(audio_path).normalize(
-20).hash()
@ -71,7 +71,7 @@ def test_io():
def test_copy_and_clone():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path)
signal.stft()
signal.loudness()
@ -369,7 +369,7 @@ def test_trim():
def test_to_from_ops():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path)
signal.stft()
signal.loudness()
@ -384,7 +384,7 @@ def test_to_from_ops():
def test_device():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path)
signal.to("cpu")
@ -397,7 +397,7 @@ def test_device():
def test_stft(window_length, hop_length, window_type):
if hop_length >= window_length:
hop_length = window_length // 2
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
stft_params = audiotools.STFTParams(
window_length=window_length,
hop_length=hop_length,
@ -456,7 +456,7 @@ def test_stft(window_length, hop_length, window_type):
def test_log_magnitude():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
for _ in range(10):
signal = AudioSignal.excerpt(audio_path, duration=5.0)
magnitude = signal.magnitude.numpy()[0, 0]
@ -474,7 +474,7 @@ def test_log_magnitude():
def test_mel_spectrogram(n_mels, window_length, hop_length, window_type):
if hop_length >= window_length:
hop_length = window_length // 2
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
stft_params = audiotools.STFTParams(
window_length=window_length,
hop_length=hop_length,
@ -492,7 +492,7 @@ def test_mel_spectrogram(n_mels, window_length, hop_length, window_type):
def test_mfcc(n_mfcc, n_mels, window_length, hop_length):
if hop_length >= window_length:
hop_length = window_length // 2
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
stft_params = audiotools.STFTParams(
window_length=window_length, hop_length=hop_length)
for _stft_params in [None, stft_params]:

@ -0,0 +1,48 @@
import sys
from pathlib import Path
import numpy as np
sys.path.append("/home/aistudio/PaddleSpeech/audio")
from audiotools import AudioSignal
from visualdl import LogWriter
def test_specshow():
array = np.zeros((1, 16000))
AudioSignal(array, sample_rate=16000).specshow()
AudioSignal(array, sample_rate=16000).specshow(preemphasis=True)
AudioSignal(
array, sample_rate=16000).specshow(
title="test", preemphasis=True)
AudioSignal(
array, sample_rate=16000).specshow(
format=False, preemphasis=True)
AudioSignal(
array, sample_rate=16000).specshow(
format=False, preemphasis=False, y_axis="mel")
def test_waveplot():
array = np.zeros((1, 16000))
AudioSignal(array, sample_rate=16000).waveplot()
def test_wavespec():
array = np.zeros((1, 16000))
AudioSignal(array, sample_rate=16000).wavespec()
def test_write_audio_to_tb():
signal = AudioSignal("./audio/spk/f10_script4_produced.mp3", duration=5)
Path("./scratch").mkdir(parents=True, exist_ok=True)
writer = LogWriter("./scratch/")
signal.write_audio_to_tb("tag", writer)
def test_save_image():
signal = AudioSignal(
"./audio/spk/f10_script4_produced.wav", duration=10, offset=10)
Path("./scratch").mkdir(parents=True, exist_ok=True)
signal.save_image("./scratch/image.png")

@ -0,0 +1,178 @@
import sys
import numpy as np
import paddle
import pytest
sys.path.append("/home/aistudio/PaddleSpeech/audio")
from audiotools import AudioSignal
from audiotools.core.util import sample_from_dist
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("sample_rate", [8000, 16000, 22050, 44100])
@pytest.mark.parametrize("duration", [0.5, 1.0, 2.0, 10.0])
def test_overlap_add(duration, sample_rate, window_duration):
np.random.seed(0)
if duration > window_duration:
spk_signal = AudioSignal.batch([
AudioSignal.excerpt(
"./audio/spk/f10_script4_produced.wav", duration=duration)
for _ in range(16)
])
spk_signal.resample(sample_rate)
noise = paddle.randn([16, 1, int(duration * sample_rate)])
nz_signal = AudioSignal(noise, sample_rate=sample_rate)
def _test(signal):
hop_duration = window_duration / 2
windowed_signal = signal.deepcopy().collect_windows(window_duration,
hop_duration)
recombined = windowed_signal.overlap_and_add(hop_duration)
assert recombined == signal
assert np.allclose(recombined.audio_data, signal.audio_data, 1e-3)
_test(nz_signal)
_test(spk_signal)
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])
@pytest.mark.parametrize("sample_rate", [8000, 16000, 22050, 44100])
@pytest.mark.parametrize("duration", [0.5, 1.0, 2.0, 10.0])
def test_inplace_overlap_add(duration, sample_rate, window_duration):
np.random.seed(0)
if duration > window_duration:
spk_signal = AudioSignal.batch([
AudioSignal.excerpt(
"./audio/spk/f10_script4_produced.wav", duration=duration)
for _ in range(16)
])
spk_signal.resample(sample_rate)
noise = paddle.randn([16, 1, int(duration * sample_rate)])
nz_signal = AudioSignal(noise, sample_rate=sample_rate)
def _test(signal):
hop_duration = window_duration / 2
windowed_signal = signal.deepcopy().collect_windows(window_duration,
hop_duration)
# Compare in-place with unfold results
for i, window in enumerate(
signal.deepcopy().windows(window_duration, hop_duration)):
assert np.allclose(window.audio_data,
windowed_signal.audio_data[i])
_test(nz_signal)
_test(spk_signal)
def test_low_pass():
sample_rate = 44100
f = 440
t = paddle.arange(0, 1, 1 / sample_rate)
sine_wave = paddle.sin(2 * np.pi * f * t)
window = AudioSignal.get_window("hann", sine_wave.shape[-1])
sine_wave = sine_wave * window
signal = AudioSignal(sine_wave.unsqueeze(0), sample_rate=sample_rate)
out = signal.deepcopy().low_pass(220)
assert out.audio_data.abs().max() < 1e-4
out = signal.deepcopy().low_pass(880)
assert (out - signal).audio_data.abs().max() < 1e-3
batch = AudioSignal.batch(
[signal.deepcopy(), signal.deepcopy(), signal.deepcopy()])
cutoffs = [220, 880, 220]
out = batch.deepcopy().low_pass(cutoffs)
assert out.audio_data[0].abs().max() < 1e-4
assert out.audio_data[2].abs().max() < 1e-4
assert (out - batch).audio_data[1].abs().max() < 1e-3
def test_high_pass():
sample_rate = 44100
f = 440
t = paddle.arange(0, 1, 1 / sample_rate)
sine_wave = paddle.sin(2 * np.pi * f * t)
window = AudioSignal.get_window("hann", sine_wave.shape[-1])
sine_wave = sine_wave * window
signal = AudioSignal(sine_wave.unsqueeze(0), sample_rate=sample_rate)
out = signal.deepcopy().high_pass(220)
assert (signal - out).audio_data.abs().max() < 1e-4
def test_mask_frequencies():
sample_rate = 44100
fs = paddle.to_tensor([500.0, 2000.0, 8000.0, 32000.0])[None]
t = paddle.arange(0, 1, 1 / sample_rate)[:, None]
sine_wave = paddle.sin(2 * np.pi * t @ fs).sum(axis=-1)
sine_wave = AudioSignal(sine_wave, sample_rate)
masked_sine_wave = sine_wave.mask_frequencies(fmin_hz=1500, fmax_hz=10000)
fs2 = paddle.to_tensor([500.0, 32000.0])[None]
sine_wave2 = paddle.sin(2 * np.pi * t @ fs2).sum(axis=-1)
sine_wave2 = AudioSignal(sine_wave2, sample_rate)
assert paddle.allclose(masked_sine_wave.audio_data, sine_wave2.audio_data)
def test_mask_timesteps():
sample_rate = 44100
f = 440
t = paddle.linspace(0, 1, sample_rate)
sine_wave = paddle.sin(2 * np.pi * f * t)
sine_wave = AudioSignal(sine_wave, sample_rate)
masked_sine_wave = sine_wave.mask_timesteps(tmin_s=0.25, tmax_s=0.75)
masked_sine_wave.istft()
mask = ((0.3 < t) & (t < 0.7))[None, None]
assert paddle.allclose(
masked_sine_wave.audio_data[mask],
paddle.zeros_like(masked_sine_wave.audio_data[mask]), )
def test_shift_phase():
sample_rate = 44100
f = 440
t = paddle.linspace(0, 1, sample_rate)
sine_wave = paddle.sin(2 * np.pi * f * t)
sine_wave = AudioSignal(sine_wave, sample_rate)
sine_wave2 = sine_wave.clone()
shifted_sine_wave = sine_wave.shift_phase(np.pi)
shifted_sine_wave.istft()
sine_wave2.phase = sine_wave2.phase + np.pi
sine_wave2.istft()
assert paddle.allclose(shifted_sine_wave.audio_data, sine_wave2.audio_data)
def test_corrupt_phase():
sample_rate = 44100
f = 440
t = paddle.linspace(0, 1, sample_rate)
sine_wave = paddle.sin(2 * np.pi * f * t)
sine_wave = AudioSignal(sine_wave, sample_rate)
sine_wave2 = sine_wave.clone()
shifted_sine_wave = sine_wave.corrupt_phase(scale=np.pi)
shifted_sine_wave.istft()
assert (sine_wave2.phase - shifted_sine_wave.phase).abs().mean() > 0.0
assert ((sine_wave2.phase - shifted_sine_wave.phase).std() / np.pi) < 1.0
def test_preemphasis():
x = AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=5)
import matplotlib.pyplot as plt
x.specshow(preemphasis=False)
x.specshow(preemphasis=True)
x.preemphasis()

@ -8,7 +8,7 @@ from audiotools import AudioSignal
def test_normalize():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=10)
signal = signal.normalize()
assert np.allclose(signal.loudness(), -24, atol=1e-1)
@ -35,7 +35,7 @@ def test_normalize():
def test_volume_change():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=10)
boost = 3
@ -50,10 +50,10 @@ def test_volume_change():
def test_mix():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=10)
audio_path = "tests/audiotools/audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
audio_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
nz = AudioSignal(audio_path, offset=10, duration=10)
spk.deepcopy().mix(nz, snr=-10)
@ -61,10 +61,10 @@ def test_mix():
assert np.allclose(snr, -10, atol=1)
# Test in batch
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=10)
audio_path = "tests/audiotools/audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
audio_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
nz = AudioSignal(audio_path, offset=10, duration=10)
batch_size = 4
@ -86,7 +86,7 @@ def test_mix():
def test_convolve():
np.random.seed(6) # Found a failing seed
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=10)
impulse = np.zeros((1, 16000), dtype="float32")
@ -106,7 +106,7 @@ def test_convolve():
assert convolved == spk_batch
# Short duration
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=0.1)
impulse = np.zeros((1, 16000), dtype="float32")
@ -128,14 +128,14 @@ def test_convolve():
def test_pipeline():
# An actual IR, no batching
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=5)
audio_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
audio_path = "./audio/ir/h179_Bar_1txts.wav"
ir = AudioSignal(audio_path)
spk.deepcopy().convolve(ir)
audio_path = "tests/audiotools/audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
audio_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"
nz = AudioSignal(audio_path, offset=10, duration=5)
batch_size = 16
@ -146,7 +146,7 @@ def test_pipeline():
# def test_codec():
# audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
# audio_path = "./audio/spk/f10_script4_produced.wav"
# spk = AudioSignal(audio_path, offset=10, duration=10)
# with pytest.raises(ValueError):
@ -156,7 +156,7 @@ def test_pipeline():
# out = spk.deepcopy().apply_codec("8-bit")
# def test_pitch_shift():
# audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
# audio_path = "./audio/spk/f10_script4_produced.wav"
# spk = AudioSignal(audio_path, offset=10, duration=1)
# single = spk.deepcopy().pitch_shift(5)
@ -169,7 +169,7 @@ def test_pipeline():
# assert np.allclose(batched[0].audio_data, single[0].audio_data)
# def test_time_stretch():
# audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
# audio_path = "./audio/spk/f10_script4_produced.wav"
# spk = AudioSignal(audio_path, offset=10, duration=1)
# single = spk.deepcopy().time_stretch(0.8)
@ -184,7 +184,7 @@ def test_pipeline():
@pytest.mark.parametrize("n_bands", [1, 2, 4, 8, 12, 16])
def test_mel_filterbank(n_bands):
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=1)
fbank = spk.deepcopy().mel_filterbank(n_bands)
@ -192,8 +192,7 @@ def test_mel_filterbank(n_bands):
# Check if it works in batches.
spk_batch = AudioSignal.batch([
AudioSignal.excerpt(
"tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
for _ in range(16)
])
fbank = spk_batch.deepcopy().mel_filterbank(n_bands)
@ -203,7 +202,7 @@ def test_mel_filterbank(n_bands):
@pytest.mark.parametrize("n_bands", [1, 2, 4, 8, 12, 16])
def test_equalizer(n_bands):
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=10)
db = -3 + 1 * paddle.rand([n_bands])
@ -212,15 +211,14 @@ def test_equalizer(n_bands):
db = -3 + 1 * np.random.rand(n_bands)
spk.deepcopy().equalizer(db)
audio_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
audio_path = "./audio/ir/h179_Bar_1txts.wav"
ir = AudioSignal(audio_path)
db = -3 + 1 * paddle.rand([n_bands])
spk.deepcopy().convolve(ir.equalizer(db))
spk_batch = AudioSignal.batch([
AudioSignal.excerpt(
"tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
for _ in range(16)
])
@ -231,13 +229,12 @@ def test_equalizer(n_bands):
def test_clip_distortion():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=2)
clipped = spk.deepcopy().clip_distortion(0.05)
spk_batch = AudioSignal.batch([
AudioSignal.excerpt(
"tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
for _ in range(16)
])
percs = paddle.to_tensor(np.random.uniform(size=(16, ))).astype("float32")
@ -249,7 +246,7 @@ def test_clip_distortion():
@pytest.mark.parametrize("quant_ch", [2, 4, 8, 16, 32, 64, 128])
def test_quantization(quant_ch):
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=2)
quantized = spk.deepcopy().quantization(quant_ch)
@ -260,8 +257,7 @@ def test_quantization(quant_ch):
assert found_quant_ch <= quant_ch
spk_batch = AudioSignal.batch([
AudioSignal.excerpt(
"tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
for _ in range(16)
])
@ -277,7 +273,7 @@ def test_quantization(quant_ch):
@pytest.mark.parametrize("quant_ch", [2, 4, 8, 16, 32, 64, 128])
def test_mulaw_quantization(quant_ch):
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
spk = AudioSignal(audio_path, offset=10, duration=2)
quantized = spk.deepcopy().mulaw_quantization(quant_ch)
@ -288,8 +284,7 @@ def test_mulaw_quantization(quant_ch):
assert found_quant_ch <= quant_ch
spk_batch = AudioSignal.batch([
AudioSignal.excerpt(
"tests/audiotools/audio/spk/f10_script4_produced.wav", duration=2)
AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
for _ in range(16)
])
@ -304,7 +299,7 @@ def test_mulaw_quantization(quant_ch):
def test_impulse_response_augmentation():
audio_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
audio_path = "./audio/ir/h179_Bar_1txts.wav"
batch_size = 16
ir = AudioSignal(audio_path)
ir_batch = AudioSignal.batch([ir for _ in range(batch_size)])
@ -330,8 +325,8 @@ def test_impulse_response_augmentation():
def test_apply_ir():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
ir_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
ir_path = "./audio/ir/h179_Bar_1txts.wav"
spk = AudioSignal(audio_path, offset=10, duration=2)
ir = AudioSignal(ir_path)

@ -9,8 +9,8 @@ from audiotools import AudioSignal
def test_audio_grad():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
ir_path = "tests/audiotools/audio/ir/h179_Bar_1txts.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
ir_path = "./audio/ir/h179_Bar_1txts.wav"
def _test_audio_grad(attr: str, target=True, kwargs: dict={}):
signal = AudioSignal(audio_path)
@ -153,7 +153,7 @@ def test_audio_grad():
def test_batch_grad():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path)
signal.audio_data.stop_gradient = False

@ -97,75 +97,5 @@ class TestHighPassFilters(_BaseTest):
self.assertSimilar(y, y2, x)
# class TestBandPassFilters(_BaseTest):
# def setUp(self):
# paddle.seed(1234)
# random.seed(1234)
# def test_keep_or_kill(self):
# for _ in range(10):
# freq = random.uniform(0.01, 0.4)
# sr = 1024
# tone = pure_tone(freq * sr, sr=sr, dur=10)
# # For this test we accept 5% tolerance in amplitude, or -26dB in power.
# tol = 5
# zeros = 16
# y_pass = filters.bandpass_filter(tone, 0.9 * freq, 1.1 * freq, zeros=zeros)
# self.assertSimilar(y_pass, tone, tone, f"freq={freq}, pass", tol=tol)
# y_killed = filters.bandpass_filter(tone, 1.1 * freq, 1.2 * freq, zeros=zeros)
# self.assertSimilar(y_killed, 0 * tone, tone, f"freq={freq}, kill", tol=tol)
# y_killed = filters.bandpass_filter(tone, 0.8 * freq, 0.9 * freq, zeros=zeros)
# self.assertSimilar(y_killed, 0 * tone, tone, f"freq={freq}, kill", tol=tol)
# def test_fft_nofft(self):
# for _ in range(10):
# x = paddle.randn([1024])
# freq = random.uniform(0.01, 0.5)
# freq2 = random.uniform(freq, 0.5)
# y_fft = filters.bandpass_filter(x, freq, freq2, fft=True)
# y_ref = filters.bandpass_filter(x, freq, freq2, fft=False)
# self.assertSimilar(y_fft, y_ref, x, f"freq={freq}", tol=0.01)
# def test_constant(self):
# x = paddle.ones([2048])
# for zeros in [4, 10]:
# for freq in [0.01, 0.1]:
# y = filters.bandpass_filter(x, freq, 1.2 * freq, zeros=zeros)
# self.assertLessEqual(y.abs().mean(), 1e-6, (zeros, freq))
# def test_stride(self):
# x = paddle.randn([1024])
# y = filters.bandpass_filter(x, 0.1, 0.2, stride=1)[::3]
# y2 = filters.bandpass_filter(x, 0.1, 0.2, stride=3)
# self.assertEqual(y.shape, y2.shape)
# self.assertSimilar(y, y2, x)
# y = filters.bandpass_filter(x, 0.1, 0.2, stride=1, pad=False)[::3]
# y2 = filters.bandpass_filter(x, 0.1, 0.2, stride=3, pad=False)
# self.assertEqual(y.shape, y2.shape)
# self.assertSimilar(y, y2, x)
# def test_same_as_highpass(self):
# x = paddle.randn([1024])
# y_ref = highpass_filter(x, 0.2)
# y = filters.bandpass_filter(x, 0.2, 0.5)
# self.assertSimilar(y, y_ref, x)
# def test_same_as_lowpass(self):
# x = paddle.randn([1024])
# y_ref = filters.lowpass_filter(x, 0.2)
# y = filters.bandpass_filter(x, 0.0, 0.2)
# self.assertSimilar(y, y_ref, x)
if __name__ == "__main__":
unittest.main()

@ -13,7 +13,7 @@ ATOL = 1e-1
def test_loudness_against_pyln():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=5, duration=10)
signal_loudness = signal.loudness()
@ -24,7 +24,7 @@ def test_loudness_against_pyln():
def test_loudness_short():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=0.25)
signal_loudness = signal.loudness()
@ -58,7 +58,7 @@ def test_batch_loudness():
# Tests below are copied from pyloudnorm
def test_integrated_loudness():
data, rate = sf.read("tests/audiotools/audio/loudness/sine_1000.wav")
data, rate = sf.read("./audio/loudness/sine_1000.wav")
meter = Meter(rate)
loudness = meter(data)
@ -67,8 +67,7 @@ def test_integrated_loudness():
def test_rel_gate_test():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_RelGateTest.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_RelGateTest.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -77,8 +76,7 @@ def test_rel_gate_test():
def test_abs_gate_test():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_AbsGateTest.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_AbsGateTest.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -87,8 +85,7 @@ def test_abs_gate_test():
def test_24LKFS_25Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_25Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_25Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -97,8 +94,7 @@ def test_24LKFS_25Hz_2ch():
def test_24LKFS_100Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_100Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_100Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -107,8 +103,7 @@ def test_24LKFS_100Hz_2ch():
def test_24LKFS_500Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_500Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_500Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -117,8 +112,7 @@ def test_24LKFS_500Hz_2ch():
def test_24LKFS_1000Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_1000Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_1000Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -127,8 +121,7 @@ def test_24LKFS_1000Hz_2ch():
def test_24LKFS_2000Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_2000Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_2000Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -137,8 +130,7 @@ def test_24LKFS_2000Hz_2ch():
def test_24LKFS_10000Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_24LKFS_10000Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_24LKFS_10000Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -147,8 +139,7 @@ def test_24LKFS_10000Hz_2ch():
def test_23LKFS_25Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_25Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_25Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -157,8 +148,7 @@ def test_23LKFS_25Hz_2ch():
def test_23LKFS_100Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_100Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_100Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -167,8 +157,7 @@ def test_23LKFS_100Hz_2ch():
def test_23LKFS_500Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_500Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_500Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -177,8 +166,7 @@ def test_23LKFS_500Hz_2ch():
def test_23LKFS_1000Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_1000Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_1000Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -187,8 +175,7 @@ def test_23LKFS_1000Hz_2ch():
def test_23LKFS_2000Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_2000Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_2000Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -197,8 +184,7 @@ def test_23LKFS_2000Hz_2ch():
def test_23LKFS_10000Hz_2ch():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_23LKFS_10000Hz_2ch.wav")
data, rate = sf.read("./audio/loudness/1770-2_Comp_23LKFS_10000Hz_2ch.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -208,7 +194,7 @@ def test_23LKFS_10000Hz_2ch():
def test_18LKFS_frequency_sweep():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Comp_18LKFS_FrequencySweep.wav")
"./audio/loudness/1770-2_Comp_18LKFS_FrequencySweep.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -218,7 +204,7 @@ def test_18LKFS_frequency_sweep():
def test_conf_stereo_vinL_R_23LKFS():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Conf_Stereo_VinL+R-23LKFS.wav")
"./audio/loudness/1770-2_Conf_Stereo_VinL+R-23LKFS.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -228,8 +214,7 @@ def test_conf_stereo_vinL_R_23LKFS():
def test_conf_monovoice_music_24LKFS():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav"
)
"./audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -239,8 +224,7 @@ def test_conf_monovoice_music_24LKFS():
def conf_monovoice_music_24LKFS():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav"
)
"./audio/loudness/1770-2_Conf_Mono_Voice+Music-24LKFS.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -250,8 +234,7 @@ def conf_monovoice_music_24LKFS():
def test_conf_monovoice_music_23LKFS():
data, rate = sf.read(
"tests/audiotools/audio/loudness/1770-2_Conf_Mono_Voice+Music-23LKFS.wav"
)
"./audio/loudness/1770-2_Conf_Mono_Voice+Music-23LKFS.wav")
meter = Meter(rate)
loudness = meter.integrated_loudness(data)
@ -266,7 +249,7 @@ def test_fir_accuracy():
transforms.HighPass(prob=0.5),
transforms.Equalizer(prob=0.5),
prob=0.5, )
loader = datasets.AudioLoader(sources=["tests/audiotools/audio/spk.csv"])
loader = datasets.AudioLoader(sources=["./audio/spk.csv"])
dataset = datasets.AudioDataset(
loader,
44100,

@ -66,8 +66,7 @@ def test_find_audio():
assert not audio_files
# Make sure it works with single audio files
audio_files = util.find_audio(
"tests/audiotools/audio/spk//f10_script4_produced.wav")
audio_files = util.find_audio("./audio/spk//f10_script4_produced.wav")
# Make sure it works with globs
audio_files = util.find_audio("tests/**/*.wav")

@ -45,7 +45,7 @@ def test_audio_dataset():
tfm.Silence(prob=0.5),
], )
loader = audiotools.data.datasets.AudioLoader(
sources=["tests/audiotools/audio/spk.csv"],
sources=["./audio/spk.csv"],
transform=transform, )
dataset = audiotools.data.datasets.AudioDataset(
loader,
@ -161,11 +161,10 @@ def test_loader_out_of_range():
def test_dataset_pipeline():
transform = tfm.Compose([
tfm.RoomImpulseResponse(sources=["tests/audiotools/audio/irs.csv"]),
tfm.BackgroundNoise(sources=["tests/audiotools/audio/noises.csv"]),
tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
])
loader = audiotools.data.datasets.AudioLoader(
sources=["tests/audiotools/audio/spk.csv"])
loader = audiotools.data.datasets.AudioLoader(sources=["./audio/spk.csv"])
dataset = audiotools.data.datasets.AudioDataset(
loader,
44100,

@ -12,13 +12,11 @@ from audiotools.data import preprocess
def test_create_csv():
with tempfile.NamedTemporaryFile(suffix=".csv") as f:
preprocess.create_csv(
find_audio("./tests/audiotools/audio/spk", ext=["wav"]),
f.name,
loudness=True)
find_audio("././audio/spk", ext=["wav"]), f.name, loudness=True)
def test_create_csv_with_empty_rows():
audio_files = find_audio("./tests/audiotools/audio/spk", ext=["wav"])
audio_files = find_audio("././audio/spk", ext=["wav"])
audio_files.insert(0, "")
audio_files.insert(2, "")

@ -49,13 +49,13 @@ def test_transform(transform_name):
kwargs = {}
if transform_name == "BackgroundNoise":
kwargs["sources"] = ["tests/audiotools/audio/noises.csv"]
kwargs["sources"] = ["./audio/noises.csv"]
if transform_name == "RoomImpulseResponse":
kwargs["sources"] = ["tests/audiotools/audio/irs.csv"]
kwargs["sources"] = ["./audio/irs.csv"]
if transform_name == "CrossTalk":
kwargs["sources"] = ["tests/audiotools/audio/spk.csv"]
kwargs["sources"] = ["./audio/spk.csv"]
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
signal.metadata["loudness"] = AudioSignal(
audio_path).ffmpeg_loudness().item()
@ -102,12 +102,12 @@ def test_transform(transform_name):
def test_compose_basic():
seed = 0
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
transform = tfm.Compose(
[
tfm.RoomImpulseResponse(sources=["tests/audiotools/audio/irs.csv"]),
tfm.BackgroundNoise(sources=["tests/audiotools/audio/noises.csv"]),
tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
], )
kwargs = transform.instantiate(seed, signal)
@ -143,7 +143,7 @@ def test_compose_with_duplicate_transforms():
full_mul = np.prod(muls)
kwargs = transform.instantiate(0)
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
output = transform(signal.clone(), **kwargs)
@ -162,7 +162,7 @@ def test_nested_compose():
full_mul = np.prod(muls)
kwargs = transform.instantiate(0)
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
output = transform(signal.clone(), **kwargs)
@ -176,7 +176,7 @@ def test_compose_filtering():
transform = tfm.Compose([MulTransform(x, name=str(x)) for x in muls])
kwargs = transform.instantiate(0)
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
for s in range(len(muls)):
@ -199,7 +199,7 @@ def test_sequential_compose():
full_mul = np.prod(muls)
kwargs = transform.instantiate(0)
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
output = transform(signal.clone(), **kwargs)
@ -210,11 +210,11 @@ def test_sequential_compose():
def test_choose_basic():
seed = 0
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
transform = tfm.Choose([
tfm.RoomImpulseResponse(sources=["tests/audiotools/audio/irs.csv"]),
tfm.BackgroundNoise(sources=["tests/audiotools/audio/noises.csv"]),
tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
])
kwargs = transform.instantiate(seed, signal)
@ -251,7 +251,7 @@ def test_choose_basic():
def test_choose_weighted():
seed = 0
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
transform = tfm.Choose(
[
MulTransform(0.0),
@ -277,7 +277,7 @@ def test_choose_weighted():
def test_choose_with_compose():
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
transform = tfm.Choose([
@ -296,7 +296,7 @@ def test_choose_with_compose():
def test_repeat():
seed = 0
audio_path = "tests/audiotools/audio/spk/f10_script4_produced.wav"
audio_path = "./audio/spk/f10_script4_produced.wav"
signal = AudioSignal(audio_path, offset=10, duration=2)
kwargs = {}
@ -356,7 +356,7 @@ class DummyData(paddle.io.Dataset):
def test_masking():
dataset = DummyData("tests/audiotools/audio/spk/f10_script4_produced.wav")
dataset = DummyData("./audio/spk/f10_script4_produced.wav")
dataloader = paddle.io.DataLoader(
dataset,
batch_size=16,
@ -385,8 +385,7 @@ def test_nested_masking():
],
prob=0.9, )
loader = audiotools.data.datasets.AudioLoader(
sources=["tests/audiotools/audio/spk.csv"])
loader = audiotools.data.datasets.AudioLoader(sources=["./audio/spk.csv"])
dataset = audiotools.data.datasets.AudioDataset(
loader,
44100,

@ -0,0 +1,4 @@
python -m pip install -r ../audiotools/requirements.txt
# wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle
# wget
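# Run only the test files whose names carry the trailing ✅ marker.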
find . -name "*✅.py" | xargs python -m pytest

@ -13,8 +13,7 @@ def test_audio_table():
audio_dict = {}
audio_dict["inputs"] = [
AudioSignal.excerpt(
"tests/audiotools/audio/spk/f10_script4_produced.wav", duration=5)
AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=5)
for _ in range(3)
]
audio_dict["outputs"] = []

@ -31,6 +31,13 @@ function main(){
cd ${speech_ci_path}/server/offline
bash test_server_client.sh
echo "End server"
echo "Start testing audiotools"
cd ${speech_ci_path}/../../audio/tests/audiotools
bash test_audiotools.sh
echo "End testing audiotools"
}
main

@ -115,3 +115,4 @@ paddlespeech whisper --task translate --input ./zh.wav
paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
echo -e "\033[32mTest success !!!\033[0m"
