# PaddleSpeech/audio/tests/audiotools/data/test_transforms.py
# MIT License, Copyright (c) 2023-Present, Descript.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from audiotools(https://github.com/descriptinc/audiotools/blob/master/tests/data/test_transforms.py)
import inspect
import sys
import warnings
from pathlib import Path
import numpy as np
import paddle
import pytest
from audio import audiotools
from audio.audiotools import AudioSignal
from audio.audiotools import util
from audio.audiotools.data import transforms as tfm
from audio.audiotools.data.datasets import AudioDataset
from paddlespeech.vector.training.seeding import seed_everything
# Transforms whose output varies between otherwise-identical runs; the
# batch-consistency portion of test_transform is skipped for these.
non_deterministic_transforms = ["TimeNoise", "FrequencyNoise"]

# Container transforms plus transforms with known precision issues are
# excluded from the 1e-4 regression comparison in test_transform.
_EXCLUDED_FROM_REGRESSION = {
    "Compose",
    "Choose",
    "Repeat",
    "RepeatUpTo",
    # The above 4 transforms are currently excluded from testing at 1e-4
    # precision due to potential accuracy issues.
    "BackgroundNoise",
    "Equalizer",
    "FrequencyNoise",
    "RoomImpulseResponse",
}

# Every attribute of tfm that exposes a ``transform`` method is a transform
# class; test all of them except the excluded set above.
transforms_to_test = [
    name for name in dir(tfm)
    if hasattr(getattr(tfm, name), "transform")
    and name not in _EXCLUDED_FROM_REGRESSION
]
def _compare_transform(transform_name, signal):
    """Compare ``signal`` against the stored regression output for a transform.

    On the first run the regression file does not exist yet, so ``signal``
    is written out and becomes the reference for subsequent runs.
    """
    reference_path = Path(f"regression/transforms/{transform_name}.wav")
    reference_path.parent.mkdir(exist_ok=True, parents=True)
    if not reference_path.exists():
        # First run: record the current output as the reference.
        signal.write(reference_path)
        return
    reference = AudioSignal(reference_path)
    assert paddle.allclose(
        signal.audio_data, reference.audio_data, atol=1e-4)
@pytest.mark.parametrize("transform_name", transforms_to_test)
def test_transform(transform_name):
    """Regression-test one transform and check batch/repeat consistency."""
    seed = 0
    seed_everything(seed)
    transform_cls = getattr(tfm, transform_name)

    kwargs = {}
    # Transforms that sample external audio need a source-list kwarg.
    if transform_name == "BackgroundNoise":
        kwargs["sources"] = ["./audio/noises.csv"]
    if transform_name == "RoomImpulseResponse":
        kwargs["sources"] = ["./audio/irs.csv"]
    if transform_name == "CrossTalk":
        kwargs["sources"] = ["./audio/spk.csv"]

    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    # Precompute full-file loudness; some transforms read it from metadata
    # instead of measuring it themselves.
    signal.metadata["loudness"] = AudioSignal(
        audio_path).ffmpeg_loudness().item()

    transform = transform_cls(prob=1.0, **kwargs)
    kwargs = transform.instantiate(seed, signal)
    # Every instantiated argument must be declared in the transform's keys.
    for k in kwargs[transform_name]:
        assert k in transform.keys
    output = transform(signal, **kwargs)
    assert isinstance(output, AudioSignal)
    _compare_transform(transform_name, output)

    # Non-deterministic transforms cannot reproduce the same output when
    # re-instantiated below, so stop here.
    if transform_name in non_deterministic_transforms:
        return

    # Test that if you make a batch of signals and call it,
    # the first item in the batch is still the same as above.
    batch_size = 4
    signal = AudioSignal(audio_path, offset=10, duration=2)
    signal_batch = AudioSignal.batch(
        [signal.clone() for _ in range(batch_size)])
    signal_batch.metadata["loudness"] = AudioSignal(
        audio_path).ffmpeg_loudness().item()

    # First batch item uses state ``seed``, matching the single-item run.
    states = [seed + idx for idx in list(range(batch_size))]
    kwargs = transform.batch_instantiate(states, signal_batch)
    batch_output = transform(signal_batch, **kwargs)
    assert batch_output[0] == output

    ## Test that you can apply transform with the same args twice.
    signal = AudioSignal(audio_path, offset=10, duration=2)
    signal.metadata["loudness"] = AudioSignal(
        audio_path).ffmpeg_loudness().item()
    kwargs = transform.instantiate(seed, signal)
    output_a = transform(signal.clone(), **kwargs)
    output_b = transform(signal.clone(), **kwargs)
    assert output_a == output_b
def test_compose_basic():
    """Compose applies its children and supports indexing, len, and iter."""
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    transform = tfm.Compose(
        [
            tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
            tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
        ], )
    transform_args = transform.instantiate(0, signal)
    transform(signal, **transform_args)
    # Due to precision issues with RoomImpulseResponse and BackgroundNoise
    # used in the Compose test, we only perform logical testing for Compose
    # and skip precision testing of the final output.
    # _compare_transform("Compose", output)

    # Sequence-style access into the composed transforms.
    assert isinstance(transform[0], tfm.RoomImpulseResponse)
    assert isinstance(transform[1], tfm.BackgroundNoise)
    assert len(transform) == 2

    # Make sure __iter__ works
    for _tfm in transform:
        pass
class MulTransform(tfm.BaseTransform):
    """Toy deterministic transform that scales audio by a fixed factor."""

    def __init__(self, num, name=None):
        # Scale factor; exposed through the transform's "num" key.
        self.num = num
        super().__init__(name=name, keys=["num"])

    def _transform(self, signal, num):
        # A 0-d tensor cannot be broadcast with a batch dimension; promote it.
        if not num.dim():
            num = num.unsqueeze(axis=0)
        # Broadcast per-batch-item scale over (channels, samples).
        signal.audio_data = signal.audio_data * num[:, None, None]
        return signal

    def _instantiate(self, state):
        return {"num": self.num}
def test_compose_with_duplicate_transforms():
    """Several instances of the same transform compose multiplicatively."""
    factors = [0.5, 0.25, 0.125]
    transform = tfm.Compose([MulTransform(f) for f in factors])
    expected_scale = np.prod(factors)

    transform_args = transform.instantiate(0)
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    output = transform(signal.clone(), **transform_args)
    assert paddle.allclose(output.audio_data,
                           signal.audio_data * expected_scale)
def test_nested_compose():
    """Compose nested inside Compose yields the product of all factors."""
    factors = [0.5, 0.25, 0.125]
    innermost = tfm.Compose([MulTransform(factors[2])])
    middle = tfm.Compose([MulTransform(factors[1]), innermost])
    transform = tfm.Compose([MulTransform(factors[0]), middle])

    transform_args = transform.instantiate(0)
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    output = transform(signal.clone(), **transform_args)
    expected = signal.audio_data * np.prod(factors)
    assert paddle.allclose(output.audio_data, expected)
def test_compose_filtering():
    """Compose.filter applies only the sub-transforms selected by name."""
    factors = [0.5, 0.25, 0.125]
    # Name each child after its factor so it can be selected by name.
    transform = tfm.Compose([MulTransform(f, name=str(f)) for f in factors])
    transform_args = transform.instantiate(0)

    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    # Try every subset size, with 10 random subsets each.
    for subset_size in range(len(factors)):
        for _ in range(10):
            chosen = np.random.choice(
                factors, size=subset_size, replace=False).tolist()
            names = [str(f) for f in chosen]
            with transform.filter(*names):
                output = transform(signal.clone(), **transform_args)
                expected = signal.audio_data * np.prod(chosen)
                assert paddle.allclose(output.audio_data, expected)
def test_sequential_compose():
    """Two sibling Compose blocks apply one after the other."""
    factors = [0.5, 0.25, 0.125]
    first = tfm.Compose([MulTransform(factors[0])])
    second = tfm.Compose(
        [MulTransform(factors[1]), MulTransform(factors[2])])
    transform = tfm.Compose([first, second])

    transform_args = transform.instantiate(0)
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    output = transform(signal.clone(), **transform_args)
    expected = signal.audio_data * np.prod(factors)
    assert paddle.allclose(output.audio_data, expected)
def test_choose_basic():
    """Choose picks exactly one of its child transforms per instantiation."""
    seed = 0
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)
    transform = tfm.Choose([
        tfm.RoomImpulseResponse(sources=["./audio/irs.csv"]),
        tfm.BackgroundNoise(sources=["./audio/noises.csv"]),
    ])

    kwargs = transform.instantiate(seed, signal)
    output = transform(signal.clone(), **kwargs)
    # Due to precision issues with RoomImpulseResponse and BackgroundNoise
    # used in the Choose test, we only perform logical testing for Choose and
    # skip precision testing of the final output.
    # _compare_transform("Choose", output)

    # With deterministic children, the output must match exactly one of the
    # two possible targets for every seed.
    transform = tfm.Choose([
        MulTransform(0.0),
        MulTransform(2.0),
    ])
    targets = [signal.clone() * 0.0, signal.clone() * 2.0]
    for seed in range(10):
        kwargs = transform.instantiate(seed, signal)
        output = transform(signal.clone(), **kwargs)
        assert any([output == target for target in targets])

    # Test that if you make a batch of signals and call it,
    # the first item in the batch is still the same as above.
    # NOTE: ``seed`` is 9 here, left over from the loop above, so the batch
    # states are 9..12.
    batch_size = 4
    signal = AudioSignal(audio_path, offset=10, duration=2)
    signal_batch = AudioSignal.batch(
        [signal.clone() for _ in range(batch_size)])
    states = [seed + idx for idx in list(range(batch_size))]
    kwargs = transform.batch_instantiate(states, signal_batch)
    batch_output = transform(signal_batch, **kwargs)

    # Each batch item must equal one of the deterministic targets.
    for nb in range(batch_size):
        assert batch_output[nb] in targets
def test_choose_weighted():
    """A zero weight means that option is never chosen."""
    audio_path = "./audio/spk/f10_script4_produced.wav"
    transform = tfm.Choose(
        [
            MulTransform(0.0),
            MulTransform(2.0),
        ],
        weights=[0.0, 1.0], )

    # Test that if you make a batch of signals and call it,
    # every item took the weight-1.0 branch (the 2.0 multiplier).
    batch_size = 4
    signal = AudioSignal(audio_path, offset=10, duration=2)
    signal_batch = AudioSignal.batch(
        [signal.clone() for _ in range(batch_size)])
    targets = [signal.clone() * 0.0, signal.clone() * 2.0]

    states = [idx for idx in range(batch_size)]
    kwargs = transform.batch_instantiate(states, signal_batch)
    batch_output = transform(signal_batch, **kwargs)

    for nb in range(batch_size):
        assert batch_output[nb] == targets[1]
def test_choose_with_compose():
    """Choose works when its options are themselves Compose transforms."""
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    transform = tfm.Choose([
        tfm.Compose([MulTransform(0.0)]),
        tfm.Compose([MulTransform(2.0)]),
    ])
    targets = [signal.clone() * 0.0, signal.clone() * 2.0]

    # Whichever branch is chosen, the result matches one of the targets.
    for seed in range(10):
        kwargs = transform.instantiate(seed, signal)
        assert transform(signal, **kwargs) in targets
def test_repeat():
    """Repeat / RepeatUpTo apply their inner transform multiple times."""
    seed = 0
    audio_path = "./audio/spk/f10_script4_produced.wav"
    signal = AudioSignal(audio_path, offset=10, duration=2)

    # Repeat a Compose of masking transforms a fixed 5 times.
    kwargs = {}
    kwargs["transform"] = tfm.Compose(
        tfm.FrequencyMask(),
        tfm.TimeMask(), )
    kwargs["n_repeat"] = 5
    transform = tfm.Repeat(**kwargs)
    kwargs = transform.instantiate(seed, signal)
    output = transform(signal.clone(), **kwargs)
    _compare_transform("Repeat", output)

    # RepeatUpTo samples a repeat count capped at max_repeat.
    kwargs = {}
    kwargs["transform"] = tfm.Compose(
        tfm.FrequencyMask(),
        tfm.TimeMask(), )
    kwargs["max_repeat"] = 10
    transform = tfm.RepeatUpTo(**kwargs)
    kwargs = transform.instantiate(seed, signal)
    output = transform(signal.clone(), **kwargs)
    _compare_transform("RepeatUpTo", output)

    # Make sure repeat does what it says: scaling by 0.5 three times must
    # scale the audio by exactly 0.5 ** 3.
    transform = tfm.Repeat(MulTransform(0.5), n_repeat=3)
    kwargs = transform.instantiate(seed, signal)
    # clip keeps the signal strictly positive so the ratio below is exact.
    signal = AudioSignal(paddle.randn([1, 1, 100]).clip(1e-5), 44100)
    output = transform(signal.clone(), **kwargs)

    scale = (output.audio_data / signal.audio_data).mean()
    assert scale == (0.5**3)
class DummyData(paddle.io.Dataset):
    """Minimal dataset yielding salient excerpts plus Silence transform args."""

    def __init__(self, audio_path):
        super().__init__()
        self.audio_path = audio_path
        self.length = 100
        # prob=0.5: roughly half the items in a batch get masked as silence.
        self.transform = tfm.Silence(prob=0.5)

    def __getitem__(self, idx):
        # Deterministic per-index randomness so items are reproducible.
        state = util.random_state(idx)
        signal = AudioSignal.salient_excerpt(
            self.audio_path, state=state, duration=1.0).resample(44100)
        # Bundle the instantiated transform arguments with the signal.
        item = self.transform.instantiate(state, signal=signal)
        item["signal"] = signal
        return item

    def __len__(self):
        return self.length
def test_masking():
    """Transform masks select which batch items a transform actually touches."""
    dataset = DummyData("./audio/spk/f10_script4_produced.wav")
    dataloader = paddle.io.DataLoader(
        dataset,
        batch_size=16,
        num_workers=0,
        collate_fn=util.collate, )
    for batch in dataloader:
        signal = batch.pop("signal")
        original = signal.clone()

        signal = dataset.transform(signal, **batch)
        original = dataset.transform(original, **batch)

        # Masked items were silenced; unmasked items must be untouched.
        mask = batch["Silence"]["mask"]
        zeros_ = paddle.zeros_like(signal[mask].audio_data)
        original_ = original[~mask].audio_data
        assert paddle.allclose(signal[mask].audio_data, zeros_)
        assert paddle.allclose(original[~mask].audio_data, original_)
def test_nested_masking():
    """A probabilistic Compose inside a dataset pipeline runs end to end."""
    transform = tfm.Compose(
        [
            tfm.VolumeNorm(prob=0.5),
            tfm.Silence(prob=0.9),
        ],
        prob=0.9, )

    loader = audiotools.data.datasets.AudioLoader(sources=["./audio/spk.csv"])
    dataset = audiotools.data.datasets.AudioDataset(
        loader,
        44100,
        n_examples=100,
        transform=transform, )
    dataloader = paddle.io.DataLoader(
        dataset, num_workers=0, batch_size=10, collate_fn=dataset.collate)

    # Smoke test: applying the batched transform args must not raise.
    for batch in dataloader:
        batch = util.prepare_batch(batch, device="cpu")
        signal = batch["signal"]
        kwargs = batch["transform_args"]
        with paddle.no_grad():
            output = dataset.transform(signal, **kwargs)
def test_smoothing_edge_case():
    """Smoothing an all-zero signal must return an all-zero signal."""
    transform = tfm.Smoothing()
    silence = paddle.zeros([1, 1, 44100])
    signal = AudioSignal(silence, 44100)
    transform_args = transform.instantiate(0, signal)
    smoothed = transform(signal, **transform_args)
    assert paddle.allclose(smoothed.audio_data, silence)
def test_global_volume_norm():
    """GlobalVolumeNorm only rescales when valid loudness metadata exists."""
    signal = AudioSignal.wave(440, 1, 44100, 1)
    transform = tfm.GlobalVolumeNorm(db=("const", -100))

    # A -inf loudness is degenerate, so the signal passes through unchanged.
    signal.metadata["loudness"] = float("-inf")
    transform_args = transform.instantiate(0, signal)
    untouched = transform(signal.clone(), **transform_args)
    assert paddle.allclose(untouched.samples, signal.samples)

    # Without a loudness key there is nothing to normalize against.
    signal.metadata.pop("loudness")
    transform_args = transform.instantiate(0, signal)
    untouched = transform(signal.clone(), **transform_args)
    assert paddle.allclose(untouched.samples, signal.samples)

    # With the real measured loudness the transform must actually rescale.
    signal.metadata["loudness"] = signal.ffmpeg_loudness()
    transform_args = transform.instantiate(0, signal)
    normalized = transform(signal.clone(), **transform_args)
    assert not paddle.allclose(normalized.samples, signal.samples)