# MIT License, Copyright (c) 2020 Alexandre Défossez.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from julius(https://github.com/adefossez/julius/tree/main/julius)
"""
Implementation of an FFT based 1D convolution in PaddlePaddle.
While some convolution implementations fall back to FFT for certain kernel sizes, this is
not the default for long kernels, e.g. 512 taps.
This module implements efficient FFT based convolutions for such cases. A typical
application is for evaluating FIR filters with a long receptive field, typically
evaluated with a stride of 1.
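Illustrative example (the helpers used below are defined in this module):

>>> import paddle
>>> x = paddle.randn([8, 16000])           # batch of 1-second signals at 16 kHz
>>> y = resample_frac(x, 16000, 8000)      # downsample to 8 kHz
>>> z = lowpass_filter(x, 0.1)             # keep frequencies below 0.1 * sample_rate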
"""
import inspect
import math
import sys
import typing
from typing import Optional
from typing import Sequence
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddlespeech.t2s.modules import fft_conv1d
from paddlespeech.t2s.modules import FFTConv1D
from paddlespeech.utils import satisfy_paddle_version
__all__ = [
'highpass_filter', 'highpass_filters', 'lowpass_filter', 'LowPassFilter',
'LowPassFilters', 'pure_tone', 'resample_frac', 'split_bands', 'SplitBands'
]
def simple_repr(obj, attrs: Optional[Sequence[str]]=None, overrides: dict={}):
"""
Return a simple representation string for `obj`.
If `attrs` is not None, it should be a list of attributes to include.
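For instance (illustrative), ``simple_repr(ResampleFrac(4, 5))`` returns
``"ResampleFrac(old_sr=4,new_sr=5)"``: attributes equal to their default value are omitted.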
"""
params = inspect.signature(obj.__class__).parameters
attrs_repr = []
if attrs is None:
attrs = list(params.keys())
for attr in attrs:
display = False
if attr in overrides:
value = overrides[attr]
elif hasattr(obj, attr):
value = getattr(obj, attr)
else:
continue
if attr in params:
param = params[attr]
if param.default is inspect._empty or value != param.default: # type: ignore
display = True
else:
display = True
if display:
attrs_repr.append(f"{attr}={value}")
return f"{obj.__class__.__name__}({','.join(attrs_repr)})"
def sinc(x: paddle.Tensor):
"""
Implementation of sinc, i.e. sin(x) / x
__Warning__: the input is not multiplied by `pi`!
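For instance, ``sinc(paddle.to_tensor([0.0]))`` evaluates to ``[1.]`` (the zero case is
defined as 1).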
"""
if satisfy_paddle_version("3.0"):
return paddle.sinc(x)
return paddle.where(
x == 0,
paddle.to_tensor(1.0, dtype=x.dtype, place=x.place),
paddle.sin(x) / x, )
class ResampleFrac(paddle.nn.Layer):
"""
Resampling from the sample rate `old_sr` to `new_sr`.
"""
def __init__(self,
old_sr: int,
new_sr: int,
zeros: int=24,
rolloff: float=0.945):
"""
Args:
old_sr (int): sample rate of the input signal x.
new_sr (int): sample rate of the output.
zeros (int): number of zero crossing to keep in the sinc filter.
rolloff (float): use a lowpass filter that is `rolloff * new_sr / 2`,
to ensure sufficient margin due to the imperfection of the FIR filter used.
Lowering this value will reduce aliasing artifacts, but will also remove some of the
highest frequencies.
Shape:
- Input: `[*, T]`
- Output: `[*, T']` with `T' = int(new_sr * T / old_sr)`
.. caution::
After dividing `old_sr` and `new_sr` by their GCD, both should be small
for this implementation to be fast.
>>> import paddle
>>> resample = ResampleFrac(4, 5)
>>> x = paddle.randn([1000])
>>> print(len(resample(x)))
1250
"""
super().__init__()
if not isinstance(old_sr, int) or not isinstance(new_sr, int):
raise ValueError("old_sr and new_sr should be integers")
gcd = math.gcd(old_sr, new_sr)
self.old_sr = old_sr // gcd
self.new_sr = new_sr // gcd
self.zeros = zeros
self.rolloff = rolloff
self._init_kernels()
def _init_kernels(self):
if self.old_sr == self.new_sr:
return
kernels = []
sr = min(self.new_sr, self.old_sr)
sr *= self.rolloff
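# Sinc interpolation: the resampled value y[j] = x(j / new_sr) is a weighted sum of the
# input samples with windowed-sinc weights. Shifting j by new_sr shifts the input index
# by old_sr with identical weights, so only `new_sr` distinct filters (phases) are needed.
# They are stacked as output channels below and applied with stride `old_sr` in `forward`.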
self._width = math.ceil(self.zeros * self.old_sr / sr)
idx = paddle.arange(
-self._width, self._width + self.old_sr, dtype="float32")
for i in range(self.new_sr):
t = (-i / self.new_sr + idx / self.old_sr) * sr
t = paddle.clip(t, -self.zeros, self.zeros)
t *= math.pi
window = paddle.cos(t / self.zeros / 2)**2
kernel = sinc(t) * window
# Renormalize kernel to ensure a constant signal is preserved.
kernel = kernel / kernel.sum()
kernels.append(kernel)
_kernel = paddle.stack(kernels).reshape([self.new_sr, 1, -1])
self.kernel = self.create_parameter(
shape=_kernel.shape,
dtype=_kernel.dtype, )
self.kernel.set_value(_kernel)
def forward(
self,
x: paddle.Tensor,
output_length: Optional[int]=None,
full: bool=False, ):
"""
Resample x.
Args:
x (Tensor): signal to resample, time should be the last dimension
output_length (None or int): This can be set to the desired output length
(last dimension). Allowed values are between 0 and
ceil(length * new_sr / old_sr). When None (default) is specified, the
floored output length will be used. In order to select the largest possible
size, use the `full` argument.
full (bool): return the longest possible output from the input. This can be useful
if you chain resampling operations, and want to give the `output_length` only
for the last one, while passing `full=True` to all the other ones.
"""
if self.old_sr == self.new_sr:
return x
shape = x.shape
_dtype = x.dtype
length = x.shape[-1]
x = x.reshape([-1, length])
x = F.pad(
x.unsqueeze(1),
[self._width, self._width + self.old_sr],
mode="replicate",
data_format="NCL", ).astype(self.kernel.dtype)
ys = F.conv1d(x, self.kernel, stride=self.old_sr, data_format="NCL")
y = ys.transpose(
[0, 2, 1]).reshape(list(shape[:-1]) + [-1]).astype(_dtype)
float_output_length = paddle.to_tensor(
self.new_sr * length / self.old_sr, dtype="float32")
max_output_length = paddle.ceil(float_output_length).astype("int64")
default_output_length = paddle.floor(float_output_length).astype(
"int64")
if output_length is None:
applied_output_length = (max_output_length
if full else default_output_length)
elif output_length < 0 or output_length > max_output_length:
raise ValueError(
f"output_length must be between 0 and {max_output_length.numpy()}"
)
else:
applied_output_length = paddle.to_tensor(
output_length, dtype="int64")
if full:
raise ValueError(
"You cannot pass both full=True and output_length")
return y[..., :applied_output_length]
def __repr__(self):
return simple_repr(self)
def resample_frac(
x: paddle.Tensor,
old_sr: int,
new_sr: int,
zeros: int=24,
rolloff: float=0.945,
output_length: Optional[int]=None,
full: bool=False, ):
"""
Functional version of `ResampleFrac`, refer to its documentation for more information.
.. warning::
If you call this function repeatedly with the same sample rates, the resampling
kernel will be recomputed every time. For best performance, create and cache an
instance of `ResampleFrac`.
"""
return ResampleFrac(old_sr, new_sr, zeros, rolloff)(x, output_length, full)
def pad_to(tensor: paddle.Tensor,
target_length: int,
mode: str="constant",
value: float=0.0):
"""
Pad the given tensor to the given length, with 0s on the right.
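For instance (illustrative), ``pad_to(paddle.zeros([1, 1, 5]), 8)`` returns a tensor of
shape ``[1, 1, 8]``; the input is expected in `NCL` layout.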
"""
return F.pad(
tensor, (0, target_length - tensor.shape[-1]),
mode=mode,
value=value,
data_format="NCL")
def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
"""
Return a pure tone, i.e. cosine.
Args:
freq (float): frequency (in Hz)
sr (float): sample rate (in Hz)
dur (float): duration (in seconds)
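For instance (illustrative), ``pure_tone(440.0, sr=16000, dur=0.5)`` returns a tensor of
shape ``[8000]`` containing a 440 Hz cosine.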
"""
time = paddle.arange(int(sr * dur), dtype="float32") / sr
return paddle.cos(2 * math.pi * freq * time)
class LowPassFilters(nn.Layer):
"""
Bank of low pass filters.
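Cutoffs are expressed as `f/f_s` with `f_s` the sample rate, so they must lie in [0, 0.5].
See `HighPassFilters` for a detailed description of the remaining arguments.
>>> lowpass = LowPassFilters([1/4, 1/8])
>>> x = paddle.randn([4, 1024])
>>> list(lowpass(x).shape)
[2, 4, 1024]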
"""
def __init__(self,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None,
dtype="float32"):
super().__init__()
self.cutoffs = list(cutoffs)
if min(self.cutoffs) < 0:
raise ValueError("Minimum cutoff must not be negative.")
if max(self.cutoffs) > 0.5:
raise ValueError("A cutoff above 0.5 does not make sense.")
self.stride = stride
self.pad = pad
self.zeros = zeros
self.half_size = int(zeros / min([c for c in self.cutoffs if c > 0]) /
2)
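# Heuristic: fall back to FFT based convolution only when the FIR filter is long.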
if fft is None:
fft = self.half_size > 32
self.fft = fft
# Create filters
window = paddle.audio.functional.get_window(
"hann", 2 * self.half_size + 1, fftbins=False, dtype=dtype)
time = paddle.arange(
-self.half_size, self.half_size + 1, dtype="float32")
filters = []
for cutoff in cutoffs:
if cutoff == 0:
filter_ = paddle.zeros_like(time)
else:
filter_ = 2 * cutoff * window * sinc(2 * cutoff * math.pi *
time)
# Normalize filter
filter_ /= paddle.sum(filter_)
filters.append(filter_)
filters = paddle.stack(filters)[:, None]
self.filters = self.create_parameter(
shape=filters.shape,
default_initializer=nn.initializer.Constant(value=0.0),
dtype="float32",
is_bias=False,
attr=paddle.ParamAttr(trainable=False), )
self.filters.set_value(filters)
def forward(self, _input):
shape = list(_input.shape)
_input = _input.reshape([-1, 1, shape[-1]])
if self.pad:
_input = F.pad(
_input, (self.half_size, self.half_size),
mode="replicate",
data_format="NCL")
if self.fft:
out = fft_conv1d(_input, self.filters, stride=self.stride)
else:
out = F.conv1d(_input, self.filters, stride=self.stride)
shape.insert(0, len(self.cutoffs))
shape[-1] = out.shape[-1]
return out.transpose([1, 0, 2]).reshape(shape)
class LowPassFilter(nn.Layer):
"""
Same as `LowPassFilters` but applies a single low pass filter.
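Shape:
- Input: `[*, T]`
- Output: `[*, T']`, with `T'=T` if `pad` is True and `stride` is 1.
>>> lowpass = LowPassFilter(1/4, stride=1)
>>> x = paddle.randn([4, 124])
>>> list(lowpass(x).shape)
[4, 124]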
"""
def __init__(self,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
super().__init__()
self._lowpasses = LowPassFilters([cutoff], stride, pad, zeros, fft)
@property
def cutoff(self):
return self._lowpasses.cutoffs[0]
@property
def stride(self):
return self._lowpasses.stride
@property
def pad(self):
return self._lowpasses.pad
@property
def zeros(self):
return self._lowpasses.zeros
@property
def fft(self):
return self._lowpasses.fft
def forward(self, _input):
return self._lowpasses(_input)[0]
def lowpass_filters(
_input: paddle.Tensor,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
"""
Functional version of `LowPassFilters`, refer to this class for more information.
"""
return LowPassFilters(cutoffs, stride, pad, zeros, fft)(_input)
def lowpass_filter(_input: paddle.Tensor,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
"""
Same as `lowpass_filters` but with a single cutoff frequency.
Output will not have a dimension inserted in the front.
"""
return lowpass_filters(_input, [cutoff], stride, pad, zeros, fft)[0]
class HighPassFilters(paddle.nn.Layer):
"""
Bank of high pass filters. See `LowPassFilters` for more
details on the implementation.
Args:
cutoffs (list[float]): list of cutoff frequencies, in [0, 0.5] expressed as `f/f_s` where
`f_s` is the sample rate and `f` is the cutoff frequency.
The upper limit is 0.5, because a signal sampled at `f_s` contains only
frequencies below `f_s / 2`.
stride (int): how much to decimate the output. Probably not a good idea
to do so with a high pass filter though...
pad (bool): if True, appropriately pad the _input over the edges (replication padding).
If `stride=1`, the output will have the same length as the _input.
zeros (float): Number of zero crossings to keep.
Controls the receptive field of the Finite Impulse Response filter.
For filters with a low cutoff frequency, e.g. 40Hz at 44.1kHz,
it is a bad idea to set this to a high value.
The default of 8 is likely appropriate for most uses. Lower values
will result in a faster filter, but with a slower attenuation around the
cutoff frequency.
fft (bool or None): if True, uses FFT based convolutions (`fft_conv1d`) rather than
standard PaddlePaddle convolutions. If False, uses standard convolutions. If None,
either one will be chosen automatically depending on the effective filter size.
.. warning::
All the filters will use the same filter size, aligned on the lowest
frequency provided. If you combine a lot of filters with very diverse frequencies, it might
be more efficient to split them over multiple modules with similar frequencies.
Shape:
- Input: `[*, T]`
- Output: `[F, *, T']`, with `T'=T` if `pad` is True and `stride` is 1, and
`F` is the number of cutoff frequencies.
>>> highpass = HighPassFilters([1/4])
>>> x = paddle.randn([4, 12, 21, 1024])
>>> list(highpass(x).shape)
[1, 4, 12, 21, 1024]
"""
def __init__(self,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
super().__init__()
self._lowpasses = LowPassFilters(cutoffs, stride, pad, zeros, fft)
@property
def cutoffs(self):
return self._lowpasses.cutoffs
@property
def stride(self):
return self._lowpasses.stride
@property
def pad(self):
return self._lowpasses.pad
@property
def zeros(self):
return self._lowpasses.zeros
@property
def fft(self):
return self._lowpasses.fft
def forward(self, _input):
lows = self._lowpasses(_input)
# We need to extract the right portion of the _input in case
# pad is False or stride > 1
if self.pad:
start, end = 0, _input.shape[-1]
else:
start = self._lowpasses.half_size
end = -start
_input = _input[..., start:end:self.stride]
highs = _input - lows
return highs
class HighPassFilter(paddle.nn.Layer):
"""
Same as `HighPassFilters` but applies a single high pass filter.
Shape:
- Input: `[*, T]`
- Output: `[*, T']`, with `T'=T` if `pad` is True and `stride` is 1.
>>> highpass = HighPassFilter(1/4, stride=1)
>>> x = paddle.randn([4, 124])
>>> list(highpass(x).shape)
[4, 124]
"""
def __init__(self,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
super().__init__()
self._highpasses = HighPassFilters([cutoff], stride, pad, zeros, fft)
@property
def cutoff(self):
return self._highpasses.cutoffs[0]
@property
def stride(self):
return self._highpasses.stride
@property
def pad(self):
return self._highpasses.pad
@property
def zeros(self):
return self._highpasses.zeros
@property
def fft(self):
return self._highpasses.fft
def forward(self, _input):
return self._highpasses(_input)[0]
def highpass_filters(
_input: paddle.Tensor,
cutoffs: Sequence[float],
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
"""
Functional version of `HighPassFilters`, refer to this class for more information.
"""
return HighPassFilters(cutoffs, stride, pad, zeros, fft)(_input)
def highpass_filter(_input: paddle.Tensor,
cutoff: float,
stride: int=1,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None):
"""
Functional version of `HighPassFilter`, refer to this class for more information.
Output will not have a dimension inserted in the front.
"""
return highpass_filters(_input, [cutoff], stride, pad, zeros, fft)[0]
class SplitBands(paddle.nn.Layer):
"""
Decomposes a signal over the given frequency bands in the waveform domain using
a cascade of low pass filters as implemented by `LowPassFilters`.
You can either specify explicitly the frequency cutoffs, or just the number of bands,
in which case the frequency cutoffs will be spread out evenly in mel scale.
Args:
sample_rate (float): Sample rate of the input signal in Hz.
n_bands (int or None): number of bands, when not giving them explicitly with `cutoffs`.
In that case, the cutoff frequencies will be evenly spaced in mel-space.
cutoffs (list[float] or None): list of frequency cutoffs in Hz.
pad (bool): if True, appropriately pad the input over the edges (replication padding),
so that the output will have the same length as the input.
zeros (float): Number of zero crossings to keep. See `LowPassFilters` for more information.
fft (bool or None): See `LowPassFilters` for more info.
.. note::
The sum of all the bands will always be the input signal.
.. warning::
Unlike `LowPassFilters`, the cutoff frequencies must be provided in Hz along
with the sample rate.
Shape:
- Input: `[*, T]`
- Output: `[B, *, T']`, with `T'=T` if `pad` is True.
If `n_bands` was provided, `B = n_bands` otherwise `B = len(cutoffs) + 1`
>>> bands = SplitBands(sample_rate=128, n_bands=10)
>>> x = paddle.randn(shape=[6, 4, 1024])
>>> list(bands(x).shape)
[10, 6, 4, 1024]
"""
def __init__(
self,
sample_rate: float,
n_bands: Optional[int]=None,
cutoffs: Optional[Sequence[float]]=None,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
super().__init__()
if (cutoffs is None) + (n_bands is None) != 1:
raise ValueError(
"You must provide either n_bands, or cutoffs, but not both.")
self.sample_rate = sample_rate
self.n_bands = n_bands
self._cutoffs = list(cutoffs) if cutoffs is not None else None
self.pad = pad
self.zeros = zeros
self.fft = fft
if cutoffs is None:
if n_bands is None:
raise ValueError("You must provide one of n_bands or cutoffs.")
if not n_bands >= 1:
raise ValueError(
f"n_bands must be at least 1 (got {n_bands})")
cutoffs = paddle.audio.functional.mel_frequencies(
n_bands + 1, 0, sample_rate / 2)[1:-1]
else:
if max(cutoffs) > 0.5 * sample_rate:
raise ValueError(
"A cutoff above sample_rate/2 does not make sense.")
if len(cutoffs) > 0:
self.lowpass = LowPassFilters(
[c / sample_rate for c in cutoffs],
pad=pad,
zeros=zeros,
fft=fft)
else:
self.lowpass = None # type: ignore
def forward(self, input):
if self.lowpass is None:
return input[None]
lows = self.lowpass(input)
low = lows[0]
bands = [low]
for low_and_band in lows[1:]:
# Get a bandpass filter by subtracting lowpasses
band = low_and_band - low
bands.append(band)
low = low_and_band
# Last band is whatever is left in the signal
bands.append(input - low)
return paddle.stack(bands)
@property
def cutoffs(self):
if self._cutoffs is not None:
return self._cutoffs
elif self.lowpass is not None:
return [c * self.sample_rate for c in self.lowpass.cutoffs]
else:
return []
def split_bands(
signal: paddle.Tensor,
sample_rate: float,
n_bands: Optional[int]=None,
cutoffs: Optional[Sequence[float]]=None,
pad: bool=True,
zeros: float=8,
fft: Optional[bool]=None, ):
"""
Functional version of `SplitBands`, refer to this class for more information.
>>> x = paddle.randn(shape=[6, 4, 1024])
>>> list(split_bands(x, sample_rate=64, cutoffs=[12, 24]).shape)
[3, 6, 4, 1024]
"""
return SplitBands(sample_rate, n_bands, cutoffs, pad, zeros, fft)(signal)