You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
322 lines
10 KiB
322 lines
10 KiB
# MIT License, Copyright (c) 2023-Present, Descript.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from audiotools(https://github.com/descriptinc/audiotools/blob/master/tests/core/test_effects.py)
|
|
import sys
|
|
|
|
import numpy as np
|
|
import paddle
|
|
import pytest
|
|
|
|
from audio.audiotools import AudioSignal
|
|
|
|
|
|
def test_normalize():
    """Check that ``normalize`` drives the measured loudness to the target.

    Three cases are exercised: a file excerpt normalized with no explicit
    target (expected loudness -24), a random stereo signal normalized to a
    sweep of scalar targets, and a random batch normalized to per-item
    targets supplied as a paddle tensor.
    """
    sample_rate = 16000

    # Case 1: excerpt of a speech file, default normalization target.
    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=10)
    speech = speech.normalize()
    assert np.allclose(speech.loudness(), -24, atol=1e-1)

    # Case 2: single random stereo signal scaled into [-1, 1], re-normalized
    # repeatedly to a sweep of targets.
    noise = np.random.randn(1, 2, 32000)
    noise = noise / np.abs(noise).max()
    sig = AudioSignal(noise, sample_rate=sample_rate)

    for target_db in -80 + np.arange(10, 75, 5):
        sig = sig.normalize(target_db)
        assert np.allclose(sig.loudness(), target_db, atol=1)  # TODO, atol=1e-1

    # Case 3: batch of random signals with one target per batch item.
    n_items = 16
    target_db = -60 + paddle.linspace(10, 30, n_items)

    noise = np.random.randn(n_items, 2, 32000)
    noise = noise / np.abs(noise).max()
    sig = AudioSignal(noise, sample_rate=sample_rate).normalize(target_db)

    # The third positional argument of np.allclose is rtol, so this is a
    # relative tolerance of 10% (plus numpy's default atol).
    assert np.allclose(sig.loudness(), target_db, rtol=1e-1)
|
|
|
|
|
|
def test_volume_change():
    """Check that ``volume_change`` shifts the loudness by the given dB.

    The loudness is compared twice: once right after the gain change, and
    once after clearing ``_loudness`` on the signal so that ``loudness()``
    is queried again from a reset state.
    """
    gain_db = 3
    sig = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=10)

    # Clone so the reference value is not affected by later changes to sig.
    expected_db = sig.loudness().clone() + gain_db

    sig = sig.volume_change(gain_db)
    assert np.allclose(expected_db, sig.loudness())

    # Reset the stored loudness before measuring again.
    sig._loudness = None
    # Positional 1e-1 in the original is np.allclose's rtol argument.
    assert np.allclose(expected_db, sig.loudness(), rtol=1e-1)
|
|
|
|
|
|
def test_mix():
    """Check that ``mix`` yields the requested speech-to-noise ratio.

    In every case the mixed result (made from a deep copy of the speech
    signal) is discarded; the assertion compares the loudness of the speech
    and noise signals after the call.
    NOTE(review): this presumably relies on ``mix`` adjusting the noise
    signal in place -- confirm against the ``mix`` implementation.
    """
    speech_path = "./audio/spk/f10_script4_produced.wav"
    noise_path = "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav"

    # Single signal, scalar SNR.
    speech = AudioSignal(speech_path, offset=10, duration=10)
    noise = AudioSignal(noise_path, offset=10, duration=10)

    speech.deepcopy().mix(noise, snr=-10)
    measured_snr = speech.loudness() - noise.loudness()
    assert np.allclose(measured_snr, -10, atol=1)

    # Batched signals, one target SNR per item. Both files are loaded again
    # so the batch is built from fresh signals.
    speech = AudioSignal(speech_path, offset=10, duration=10)
    noise = AudioSignal(noise_path, offset=10, duration=10)

    n_items = 4
    target_snr = paddle.linspace(-10, 10, n_items)

    speech_batch = AudioSignal.batch(
        [speech.deepcopy() for _ in range(n_items)])
    noise_batch = AudioSignal.batch(
        [noise.deepcopy() for _ in range(n_items)])

    speech_batch.deepcopy().mix(noise_batch, snr=target_snr)
    measured_snr = speech_batch.loudness() - noise_batch.loudness()
    assert np.allclose(measured_snr, target_snr, atol=1)

    # Same check while also "EQing" the other signal; the EQ curve is
    # all zeros (0 + 0 * rand).
    eq_db = 0 + 0 * paddle.rand([10])
    speech_batch.deepcopy().mix(noise_batch, snr=target_snr, other_eq=eq_db)
    measured_snr = speech_batch.loudness() - noise_batch.loudness()
    assert np.allclose(measured_snr, target_snr, atol=1)
|
|
|
|
|
|
def test_convolve():
    """Check that convolving with a unit impulse leaves the batch unchanged.

    The check runs for a 10 second excerpt and for a short 0.1 second
    excerpt. Each impulse response in the batch is a unit impulse that is
    zero-padded at the front by a random number of samples (< 1000).
    """
    np.random.seed(6)  # Found a failing seed
    n_items = 4

    # Second entry is the "short duration" case.
    for excerpt_seconds in (10, 0.1):
        speech = AudioSignal(
            "./audio/spk/f10_script4_produced.wav",
            offset=10,
            duration=excerpt_seconds)

        # Unit impulse: 1 at the first sample, 0 everywhere else.
        unit_impulse = np.zeros((1, 16000), dtype="float32")
        unit_impulse[..., 0] = 1
        ir = AudioSignal(unit_impulse, 16000)

        speech_batch = AudioSignal.batch(
            [speech.deepcopy() for _ in range(n_items)])

        padded_irs = [
            ir.deepcopy().zero_pad(np.random.randint(1000), 0)
            for _ in range(n_items)
        ]
        ir_batch = AudioSignal.batch(padded_irs, pad_signals=True)

        result = speech_batch.deepcopy().convolve(ir_batch)
        assert result == speech_batch
|
|
|
|
|
|
def test_pipeline():
    """Smoke-test chaining a convolution with a mix.

    No values are asserted; the test passes if the calls complete without
    raising.
    """
    # An actual IR, no batching
    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=5)
    room_ir = AudioSignal("./audio/ir/h179_Bar_1txts.wav")
    speech.deepcopy().convolve(room_ir)

    noise = AudioSignal(
        "./audio/nz/f5_script2_ipad_balcony1_room_tone.wav",
        offset=10,
        duration=5)

    # One target SNR per entry, evenly spaced between 20 and 30.
    n_targets = 16
    target_snr = paddle.linspace(20, 30, n_targets)

    # Apply the IR through the ``@`` operator, then mix in the noise.
    reverberant = speech @ room_ir
    reverberant.mix(noise, snr=target_snr)
|
|
|
|
|
|
@pytest.mark.parametrize("n_bands", [1, 2, 4, 8, 12, 16])
def test_mel_filterbank(n_bands):
    """Check that the mel filterbank bands sum back to the input audio.

    Args:
        n_bands: Number of bands passed to ``mel_filterbank``.
    """
    speech_path = "./audio/spk/f10_script4_produced.wav"

    # Single signal: summing over the last (band) axis must reproduce the
    # original samples.
    speech = AudioSignal(speech_path, offset=10, duration=1)
    bands = speech.deepcopy().mel_filterbank(n_bands)
    assert paddle.allclose(bands.sum(-1), speech.audio_data, atol=1e-6)

    # Check if it works in batches.
    excerpts = [
        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ]
    speech_batch = AudioSignal.batch(excerpts)
    bands = speech_batch.deepcopy().mel_filterbank(n_bands)
    reconstructed = bands.sum(-1)
    assert paddle.allclose(reconstructed, speech_batch.audio_data, atol=1e-6)
|
|
|
|
|
|
@pytest.mark.parametrize("n_bands", [1, 2, 4, 8, 12, 16])
def test_equalizer(n_bands):
    """Exercise ``equalizer`` with tensor and array curves, and with zeros.

    The first calls are smoke tests (no assertion). The final check asserts
    that an all-zero EQ curve leaves a batch equal to the original.

    Args:
        n_bands: Number of EQ bands in each curve.
    """
    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=10)

    # EQ curve given as a paddle tensor.
    curve = -3 + 1 * paddle.rand([n_bands])
    speech.deepcopy().equalizer(curve)

    # EQ curve given as a numpy array.
    curve = -3 + 1 * np.random.rand(n_bands)
    speech.deepcopy().equalizer(curve)

    # Equalize an impulse response, then convolve the speech with it.
    room_ir = AudioSignal("./audio/ir/h179_Bar_1txts.wav")
    curve = -3 + 1 * paddle.rand([n_bands])
    speech.deepcopy().convolve(room_ir.equalizer(curve))

    excerpts = [
        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ]
    speech_batch = AudioSignal.batch(excerpts)

    # A 0 dB curve for every item and band must be a no-op.
    flat_curve = paddle.zeros([speech_batch.batch_size, n_bands])
    equalized = speech_batch.deepcopy().equalizer(flat_curve)

    assert equalized == speech_batch
|
|
|
|
|
|
def test_clip_distortion():
    """Check that clip-distorted audio stays strictly below 1.0 in magnitude.

    Covers a single signal with a scalar clip argument and a batch with one
    random clip value per item.
    """
    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=2)
    clipped_single = speech.deepcopy().clip_distortion(0.05)

    excerpts = [
        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ]
    speech_batch = AudioSignal.batch(excerpts)

    # One uniformly sampled value in [0, 1) per batch item, as float32.
    clip_values = paddle.to_tensor(np.random.uniform(size=(16, )))
    clip_values = clip_values.astype("float32")
    clipped_many = speech_batch.deepcopy().clip_distortion(clip_values)

    assert clipped_single.audio_data.abs().max() < 1.0
    assert clipped_many.audio_data.abs().max() < 1.0
|
|
|
|
|
|
@pytest.mark.parametrize("quant_ch", [2, 4, 8, 16, 32, 64, 128])
def test_quantization(quant_ch):
    """Check that ``quantization`` yields at most the requested level count.

    Args:
        quant_ch: Number of quantization channels for the single-signal
            case. The batched case draws its own per-item values.
    """
    levels = [2, 4, 8, 16, 32, 64, 128]

    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=2)
    quantized = speech.deepcopy().quantization(quant_ch)

    # Round before counting distinct values: ops with a straight-through
    # estimator are sometimes a bit off past 3 decimal places.
    rounded = np.around(quantized.audio_data, decimals=3)
    assert len(np.unique(rounded)) <= quant_ch

    excerpts = [
        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ]
    speech_batch = AudioSignal.batch(excerpts)

    # One randomly chosen channel count per batch item.
    per_item_ch = np.random.choice(levels, size=(16, ), replace=True)
    quantized = speech_batch.deepcopy().quantization(per_item_ch)

    for idx, item_ch in enumerate(per_item_ch):
        rounded = np.around(quantized.audio_data[idx], decimals=3)
        assert len(np.unique(rounded)) <= item_ch
|
|
|
|
|
|
@pytest.mark.parametrize("quant_ch", [2, 4, 8, 16, 32, 64, 128])
def test_mulaw_quantization(quant_ch):
    """Check that ``mulaw_quantization`` yields at most the requested levels.

    Args:
        quant_ch: Number of quantization channels for the single-signal
            case. The batched case draws its own per-item values.
    """
    levels = [2, 4, 8, 16, 32, 64, 128]

    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=2)
    quantized = speech.deepcopy().mulaw_quantization(quant_ch)

    # Round before counting distinct values: ops with a straight-through
    # estimator are sometimes a bit off past 3 decimal places.
    rounded = np.around(quantized.audio_data, decimals=3)
    assert len(np.unique(rounded)) <= quant_ch

    excerpts = [
        AudioSignal.excerpt("./audio/spk/f10_script4_produced.wav", duration=2)
        for _ in range(16)
    ]
    speech_batch = AudioSignal.batch(excerpts)

    # One randomly chosen channel count per batch item.
    per_item_ch = np.random.choice(levels, size=(16, ), replace=True)
    quantized = speech_batch.deepcopy().mulaw_quantization(per_item_ch)

    for idx, item_ch in enumerate(per_item_ch):
        rounded = np.around(quantized.audio_data[idx], decimals=3)
        assert len(np.unique(rounded)) <= item_ch
|
|
|
|
|
|
def test_impulse_response_augmentation():
    """Check IR decomposition, alpha solving and DRR alteration on a batch.

    Verifies that the decomposed parts share one shape, that solving for
    alpha at the measured DRR gives ones, and that ``alter_drr`` produces
    the requested DRR for both a scalar and a per-item target.
    """
    n_items = 16
    ir = AudioSignal("./audio/ir/h179_Bar_1txts.wav")
    ir_batch = AudioSignal.batch([ir for _ in range(n_items)])

    early, late, win = ir_batch.decompose_ir()
    assert early.shape == late.shape
    assert late.shape == win.shape

    # Solving for the DRR the batch already has should give alpha == 1.
    measured = ir_batch.measure_drr()
    alpha = AudioSignal.solve_alpha(early, late, win, measured)
    # Positional 1e-5 in the original is np.allclose's rtol argument.
    assert np.allclose(alpha, np.ones_like(alpha), rtol=1e-5)

    # Scalar target applied to every item.
    scalar_target = 5
    altered = ir_batch.deepcopy().alter_drr(scalar_target)
    measured = altered.measure_drr()
    assert np.allclose(measured, np.ones_like(measured) * scalar_target)

    # Independent random target in [0, 50) for each item.
    item_targets = np.random.rand(n_items).astype("float32") * 50
    altered = ir_batch.deepcopy().alter_drr(item_targets)
    measured = altered.measure_drr()
    assert np.allclose(measured.flatten(), item_targets.flatten())
|
|
|
|
|
|
def test_apply_ir():
    """Check ``apply_ir`` with a target DRR, with and without original phase.

    After the first call the DRR measured on ``ir`` itself is asserted to
    be 10. The second call, with ``use_original_phase=True``, is a smoke
    test only.
    NOTE(review): the assertion on ``ir`` presumably relies on ``apply_ir``
    altering the passed IR in place -- confirm against the implementation.
    """
    speech = AudioSignal(
        "./audio/spk/f10_script4_produced.wav", offset=10, duration=2)
    room_ir = AudioSignal("./audio/ir/h179_Bar_1txts.wav")

    # All-zero EQ curve (0 + 0 * rand).
    eq_db = 0 + 0 * paddle.rand([10])

    speech.deepcopy().apply_ir(room_ir, drr=10, ir_eq=eq_db)
    assert np.allclose(room_ir.measure_drr().flatten(), 10)

    speech.deepcopy().apply_ir(
        room_ir, drr=10, ir_eq=eq_db, use_original_phase=True)
|
|
|
|
|
|
def test_ensure_max_of_audio():
    """Check that ``ensure_max_of_audio`` caps the peak and spares quiet audio.

    A random-normal signal is capped at 1.0 and at ten random limits; the
    resulting peak must not exceed the limit by more than 1e-3. A signal
    whose samples lie in [0, 0.5) must come back unchanged when the method
    is called with no argument.
    """
    loud = AudioSignal(paddle.randn([1, 1, 44100]), 44100)

    limits = [1.0]
    limits.extend(np.random.rand() for _ in range(10))
    for limit in limits:
        capped = loud.deepcopy().ensure_max_of_audio(limit)
        assert capped.audio_data.abs().max() <= limit + 1e-3

    # Make sure it does nothing to a tiny signal
    quiet = AudioSignal(paddle.rand([1, 1, 44100]), 44100)
    quiet.audio_data = quiet.audio_data * 0.5
    untouched = quiet.deepcopy().ensure_max_of_audio()

    assert paddle.allclose(untouched.audio_data, quiet.audio_data)
|