# MIT License, Copyright (c) 2023-Present, Descript.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from audiotools(https://github.com/descriptinc/audiotools/blob/master/audiotools/core/loudness.py)
import copy
import math
import typing

import numpy as np
import paddle
import paddle.nn.functional as F
import scipy
import scipy.signal

from . import _julius


def _unfold1d(x, kernel_size, stride):
    """Split the last axis of a 3-D tensor into (possibly overlapping) frames.

    The input is right-padded with zeros so that every sample is covered by
    at least one frame. The i-th frame spans
    ``i * stride : i * stride + kernel_size`` along the last axis.

    Args:
        x (Tensor): tensor of shape ``[N, C, T]`` to be framed.
        kernel_size (int): size of each frame.
        stride (int): hop between the starts of consecutive frames.

    Returns:
        Tensor: frames of shape ``[N, C, F, kernel_size]`` with
        ``F = 1 + ceil((max(T, kernel_size) - kernel_size) / stride)``.

    Raises:
        NotImplementedError: if ``x`` is not 3-dimensional.
    """
    # https://github.com/PaddlePaddle/Paddle/pull/70102
    if 3 != x.dim():
        raise NotImplementedError

    N, C, length = x.shape
    # Fold channels into the batch axis so each row is framed independently.
    x = x.reshape([N * C, 1, length])

    n_frames = math.ceil(
        (max(length, kernel_size) - kernel_size) / stride) + 1
    tgt_length = (n_frames - 1) * stride + kernel_size
    x = F.pad(x, (0, tgt_length - length), data_format="NCL")

    # ``unfold`` works on 4-D input, so add a dummy trailing width axis.
    x = x.unsqueeze(-1)

    unfolded = paddle.nn.functional.unfold(
        x,
        kernel_sizes=[kernel_size, 1],
        strides=[stride, 1],
    )

    # [N * C, kernel_size, F] -> [N * C, F, kernel_size] -> [N, C, F, kernel_size]
    unfolded = unfolded.transpose([0, 2, 1])
    unfolded = unfolded.reshape([N, C, *unfolded.shape[1:]])
    return unfolded


class Meter(paddle.nn.Layer):
    """Tensorized version of pyloudnorm.Meter. Works with batched audio tensors.

    Audio passed to the methods of this class is laid out as
    ``(nb, nt, nch)``, i.e. batch, time, channels.

    Parameters
    ----------
    rate : int
        Sample rate of audio.
    filter_class : str, optional
        Class of weighting filter used.
        'K-weighting' (default), 'Fenton/Lee 1'
        'Fenton/Lee 2', 'Dash et al.'
        by default "K-weighting"
    block_size : float, optional
        Gating block size in seconds, by default 0.400
    zeros : int, optional
        Number of zeros to use in FIR approximation of
        IIR filters, by default 512
    use_fir : bool, optional
        Whether to use FIR approximation or exact IIR formulation.
        The flag is stored, but ``apply_filter`` currently always uses the
        exact IIR (CPU) formulation, by default False
    """

    def __init__(
            self,
            rate: int,
            filter_class: str = "K-weighting",
            block_size: float = 0.400,
            zeros: int = 512,
            use_fir: bool = False,
    ):
        super().__init__()

        # ``rate`` must be set before ``filter_class``: the property setter
        # builds a pyloudnorm meter from ``self.rate``.
        self.rate = rate
        self.filter_class = filter_class
        self.block_size = block_size
        self.use_fir = use_fir

        # Per-channel gains, indexed by channel position.
        G = paddle.to_tensor(
            np.array([1.0, 1.0, 1.0, 1.41, 1.41]), stop_gradient=True)
        self.register_buffer("G", G)

        # Compute impulse responses so that filtering is fast via
        # a convolution at runtime, on GPU, unlike lfilter.
        impulse = np.zeros((zeros, ))
        impulse[..., 0] = 1.0

        firs = np.zeros((len(self._filters), 1, zeros))
        passband_gain = paddle.zeros([len(self._filters)], dtype="float32")

        for i, (_, filter_stage) in enumerate(self._filters.items()):
            firs[i] = scipy.signal.lfilter(filter_stage.b, filter_stage.a,
                                           impulse)
            passband_gain[i] = filter_stage.passband_gain

        # Time-reverse the impulse responses before storing them as FIR taps.
        firs = paddle.to_tensor(
            firs[..., ::-1].copy(), dtype="float32", stop_gradient=True)

        self.register_buffer("firs", firs)
        self.register_buffer("passband_gain", passband_gain)

    def apply_filter_gpu(self, data: paddle.Tensor):
        """Performs FIR approximation of loudness computation.

        Parameters
        ----------
        data : paddle.Tensor
            Audio data of shape (nb, nt, nch).

        Returns
        -------
        paddle.Tensor
            Filtered audio data of shape (nb, nt, nch).
        """
        # Data is of shape (nb, nt, nch)
        # Reshape to (nb*nch, 1, nt)
        nb, nt, nch = data.shape
        data = data.transpose([0, 2, 1])
        data = data.reshape([nb * nch, 1, nt])

        # Apply padding
        pad_length = self.firs.shape[-1]

        # Apply filtering in sequence
        for i in range(self.firs.shape[0]):
            data = F.pad(data, (pad_length, pad_length), data_format="NCL")
            data = _julius.fft_conv1d(data, self.firs[i, None, ...])
            data = self.passband_gain[i] * data
            data = data[..., 1:nt + 1]

        # Unfold the channels out of the batch axis again before restoring the
        # (nb, nt, nch) layout. Transposing the folded (nb*nch, 1, nt) tensor
        # directly is only correct for mono input.
        data = data.reshape([nb, nch, -1])
        data = data.transpose([0, 2, 1])
        data = data[:, :nt, :]
        return data

    @staticmethod
    def scipy_lfilter(waveform, a_coeffs, b_coeffs, clamp: bool = True):
        """Filter a 3-D array along its time axis with ``scipy.signal.lfilter``.

        Parameters
        ----------
        waveform : np.ndarray
            Audio data of shape (nb, nt, nch).
        a_coeffs : array_like
            Denominator coefficients of the filter.
        b_coeffs : array_like
            Numerator coefficients of the filter.
        clamp : bool, optional
            Accepted for interface compatibility; not used, by default True

        Returns
        -------
        np.ndarray
            Filtered audio with the same shape and dtype as ``waveform``.
        """
        # Filter each (batch, channel) time series independently.
        output = np.zeros_like(waveform)
        for batch_idx in range(waveform.shape[0]):
            for channel_idx in range(waveform.shape[2]):
                output[batch_idx, :, channel_idx] = scipy.signal.lfilter(
                    b_coeffs, a_coeffs, waveform[batch_idx, :, channel_idx])
        return output

    def apply_filter_cpu(self, data: paddle.Tensor):
        """Performs IIR formulation of loudness computation.

        Parameters
        ----------
        data : paddle.Tensor
            Audio data of shape (nb, nt, nch).

        Returns
        -------
        paddle.Tensor
            Filtered audio data of shape (nb, nt, nch).
        """
        # Work on a NumPy copy so the caller's tensor is left untouched.
        _data = data.cpu().numpy().copy()

        # Apply every weighting stage in sequence, scaling by its gain.
        for _, filter_stage in self._filters.items():
            passband_gain = filter_stage.passband_gain
            a_coeffs = filter_stage.a
            b_coeffs = filter_stage.b

            filtered = self.scipy_lfilter(_data, a_coeffs, b_coeffs)
            _data[:] = passband_gain * filtered

        data = paddle.to_tensor(_data)
        return data

    def apply_filter(self, data: paddle.Tensor):
        """Applies the frequency weighting filters to the audio.

        The exact IIR (CPU) formulation is always used. Dispatching to
        ``apply_filter_gpu`` based on the tensor's device or on
        ``self.use_fir`` is currently disabled.

        Parameters
        ----------
        data : paddle.Tensor
            Audio data of shape (nb, nt, nch).

        Returns
        -------
        paddle.Tensor
            Filtered audio data of shape (nb, nt, nch).
        """
        # NOTE: the device / ``use_fir`` based dispatch is disabled:
        # if data.place.is_gpu_place() or self.use_fir:
        #     data = self.apply_filter_gpu(data)
        # else:
        #     data = self.apply_filter_cpu(data)
        data = self.apply_filter_cpu(data)
        return data

    def forward(self, data: paddle.Tensor):
        """Computes integrated loudness of data.

        Parameters
        ----------
        data : paddle.Tensor
            Audio data of shape (nb, nt, nch).

        Returns
        -------
        paddle.Tensor
            Integrated loudness of each batch item, of shape (nb,).
        """
        return self.integrated_loudness(data)

    def _unfold(self, input_data):
        """Cut (nb, nt, nch) audio into gating blocks with 75% overlap.

        Returns a tensor of shape (nb, nch, block_samples, n_blocks).
        """
        T_g = self.block_size
        overlap = 0.75  # overlap of 75% of the block duration
        step = 1.0 - overlap  # step size by percentage

        kernel_size = int(T_g * self.rate)
        stride = int(T_g * self.rate * step)

        # ``_unfold1d`` frames the last axis, so move time there first.
        unfolded = _unfold1d(
            input_data.transpose([0, 2, 1]), kernel_size, stride)
        unfolded = unfolded.transpose([0, 1, 3, 2])

        return unfolded

    def integrated_loudness(self, data: paddle.Tensor):
        """Computes integrated loudness of data.

        Parameters
        ----------
        data : paddle.Tensor
            Audio data of shape (nb, nt, nch). Inputs with fewer dimensions
            get a trailing channel axis and/or a leading batch axis added.

        Returns
        -------
        paddle.Tensor
            Integrated loudness of each batch item as float32, of shape (nb,).
        """
        if not paddle.is_tensor(data):
            data = paddle.to_tensor(data, dtype="float32")
        else:
            data = data.astype("float32")

        input_data = data.clone()
        # Data always has a batch and channel dimension.
        # Is of shape (nb, nt, nch)
        if input_data.ndim < 2:
            input_data = input_data.unsqueeze(-1)
        if input_data.ndim < 3:
            input_data = input_data.unsqueeze(0)

        nb, nt, nch = input_data.shape

        # Apply frequency weighting filters - account
        # for the acoustic response of the head and auditory system
        input_data = self.apply_filter(input_data)

        G = self.G  # channel gains
        T_g = self.block_size  # 400 ms gating block standard
        Gamma_a = -70.0  # -70 LKFS = absolute loudness threshold

        unfolded = self._unfold(input_data)

        # Mean square of every gating block: (nb, nch, n_blocks).
        z = (1.0 / (T_g * self.rate)) * unfolded.square().sum(2)
        # Loudness of every gating block, broadcast over channels.
        l = -0.691 + 10.0 * paddle.log10(
            (G[None, :nch, None] * z).sum(1, keepdim=True))
        l = l.expand_as(z)

        # find gating block indices above absolute threshold
        # (``z_avg_gated`` aliases ``z``; the masking below is in place)
        z_avg_gated = z
        z_avg_gated[l <= Gamma_a] = 0
        masked = l > Gamma_a
        z_avg_gated = z_avg_gated.sum(2) / masked.sum(2).astype("float32")

        # calculate the relative threshold value (see eq. 6)
        Gamma_r = -0.691 + 10.0 * paddle.log10(
            (z_avg_gated * G[None, :nch]).sum(-1)) - 10.0
        Gamma_r = Gamma_r[:, None, None]
        Gamma_r = Gamma_r.expand([nb, nch, l.shape[-1]])

        # find gating block indices above relative and absolute thresholds (end of eq. 7)
        z_avg_gated = z
        z_avg_gated[l <= Gamma_a] = 0
        z_avg_gated[l <= Gamma_r] = 0
        masked = (l > Gamma_a) * (l > Gamma_r)
        z_avg_gated = z_avg_gated.sum(2) / (masked.sum(2) + 10e-6)

        # TODO Currently, paddle has a segmentation fault bug in this section of the code
        # z_avg_gated = paddle.nan_to_num(z_avg_gated)
        # z_avg_gated = paddle.where(
        #     paddle.isnan(z_avg_gated),
        #     paddle.zeros_like(z_avg_gated), z_avg_gated)
        # Clamp infinities to the finite float32 range instead.
        z_avg_gated[z_avg_gated == float("inf")] = float(
            np.finfo(np.float32).max)
        z_avg_gated[z_avg_gated == -float("inf")] = float(
            np.finfo(np.float32).min)

        LUFS = -0.691 + 10.0 * paddle.log10(
            (G[None, :nch] * z_avg_gated).sum(1))
        return LUFS.astype("float32")

    @property
    def filter_class(self):
        """Name of the weighting filter class currently in use."""
        return self._filter_class

    @filter_class.setter
    def filter_class(self, value):
        # Let pyloudnorm build the filter stages for this sample rate and
        # keep a reference to them on this layer.
        from pyloudnorm import Meter

        meter = Meter(self.rate)
        meter.filter_class = value
        self._filter_class = value
        self._filters = meter._filters


class LoudnessMixin:
    # Cached result of ``loudness``; ``None`` until it has been computed.
    _loudness = None
    MIN_LOUDNESS = -70
    """Minimum loudness possible."""

    def loudness(self,
                 filter_class: str = "K-weighting",
                 block_size: float = 0.400,
                 **kwargs):
        """Calculates loudness using an implementation of ITU-R BS.1770-4.
        Allows control over gating block size and frequency weighting filters for
        additional control. Measure the integrated gated loudness of a signal.

        API is derived from PyLoudnorm, but this implementation is ported to
        Paddle and is tensorized across batches.

        Uses the weighting filters and block size defined by the meter
        the integrated loudness is measured based upon the gating algorithm
        defined in the ITU-R BS.1770-4 specification.

        The result is cached on the instance: once computed, later calls
        return the cached value regardless of the arguments passed.

        Parameters
        ----------
        filter_class : str, optional
            Class of weighting filter used.
            'K-weighting' (default), 'Fenton/Lee 1'
            'Fenton/Lee 2', 'Dash et al.'
            by default "K-weighting"
        block_size : float, optional
            Gating block size in seconds, by default 0.400
        kwargs : dict, optional
            Keyword arguments to :py:func:`audiotools.core.loudness.Meter`.

        Returns
        -------
        paddle.Tensor
            Loudness of audio data, floored at ``MIN_LOUDNESS``.
        """
        if self._loudness is not None:
            return self._loudness  # .to(self.device)
        original_length = self.signal_length
        # Signals shorter than 0.5 s are zero-padded up to 0.5 s before
        # measuring, and truncated back to their original length afterwards.
        if self.signal_duration < 0.5:
            pad_len = int((0.5 - self.signal_duration) * self.sample_rate)
            self.zero_pad(0, pad_len)

        # create BS.1770 meter
        meter = Meter(
            self.sample_rate,
            filter_class=filter_class,
            block_size=block_size,
            **kwargs)
        # meter = meter.to(self.device)
        # measure loudness; the meter expects (nb, nt, nch)
        loudness = meter.integrated_loudness(
            self.audio_data.transpose([0, 2, 1]))
        self.truncate_samples(original_length)
        min_loudness = paddle.ones_like(loudness) * self.MIN_LOUDNESS
        self._loudness = paddle.maximum(loudness, min_loudness)

        return self._loudness  # .to(self.device)