From 4ebc75cf10cd18be87d3cec79c0ba9ce9d6063d3 Mon Sep 17 00:00:00 2001
From: drryanhuang
Date: Mon, 30 Dec 2024 08:02:42 +0000
Subject: [PATCH] fix paddle 2.5 version

---
 audio/audiotools/core/_julius.py      | 419 +++++++++++++-------------
 audio/audiotools/core/audio_signal.py |   6 +-
 audio/audiotools/data/transforms.py   |   2 +-
 3 files changed, 213 insertions(+), 214 deletions(-)

diff --git a/audio/audiotools/core/_julius.py b/audio/audiotools/core/_julius.py
index fc137c569..d6052b268 100644
--- a/audio/audiotools/core/_julius.py
+++ b/audio/audiotools/core/_julius.py
@@ -9,6 +9,7 @@ evaluated with a stride of 1.
 """
 import inspect
 import math
+import sys
 import typing
 from typing import Optional
 from typing import Sequence
@@ -16,9 +17,12 @@ from typing import Sequence
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
+sys.path.append("/home/aistudio/PaddleSpeech")
+from paddlespeech.t2s.modules import fft_conv1d
+from paddlespeech.t2s.modules import FFTConv1D
 
 __all__ = [
-    'fft_conv1d', 'FFTConv1d', 'highpass_filter', 'highpass_filters',
+    'fft_conv1d', 'FFTConv1D', 'highpass_filter', 'highpass_filters',
     'lowpass_filter', 'LowPassFilter', 'LowPassFilters', 'pure_tone',
     'resample_frac', 'split_bands', 'SplitBands'
 ]
@@ -243,216 +247,209 @@ def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
     return paddle.cos(2 * math.pi * freq * time)
 
 
-def unfold(_input, kernel_size: int, stride: int):
-    """1D only unfolding similar to the one from PyTorch.
-    However PyTorch unfold is extremely slow.
-
-    Given an _input tensor of size `[*, T]` this will return
-    a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
-    of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
-    This will automatically pad the _input to cover at least once all entries in `_input`.
-
-    Args:
-        _input (Tensor): tensor for which to return the frames.
-        kernel_size (int): size of each frame.
-        stride (int): stride between each frame.
-
-    Shape:
-
-        - Inputs: `_input` is `[*, T]`
-        - Output: `[*, F, kernel_size]` with `F = 1 + ceil((T - kernel_size) / stride)`
-
-    ..Warning:: unlike PyTorch unfold, this will pad the _input
-        so that any position in `_input` is covered by at least one frame.
-    """
-    shape = list(_input.shape)
-    length = shape.pop(-1)
-    n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
-    tgt_length = (n_frames - 1) * stride + kernel_size
-    padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")
-    strides: typing.List[int] = []
-    for dim in range(padded.dim()):
-        strides.append(padded.strides[dim])
-    assert strides.pop(-1) == 1, "data should be contiguous"
-    strides = strides + [stride, 1]
-    return padded.as_strided(shape + [n_frames, kernel_size], strides)
-
-
-def _new_rfft(x: paddle.Tensor):
-    z = paddle.fft.rfft(x, axis=-1)
-
-    z_real = paddle.real(z)
-    z_imag = paddle.imag(z)
-
-    z_view_as_real = paddle.stack([z_real, z_imag], axis=-1)
-    return z_view_as_real
-
-
-def _new_irfft(x: paddle.Tensor, length: int):
-    x_real = x[..., 0]
-    x_imag = x[..., 1]
-    x_view_as_complex = paddle.complex(x_real, x_imag)
-    return paddle.fft.irfft(x_view_as_complex, n=length, axis=-1)
-
-
-def _compl_mul_conjugate(a: paddle.Tensor, b: paddle.Tensor):
-    """
-    Given a and b two tensors of dimension 4
-    with the last dimension being the real and imaginary part,
-    returns a multiplied by the conjugate of b, the multiplication
-    being with respect to the second dimension.
-
-    PaddlePaddle does not have direct support for complex number operations
-    using einsum in the same manner as PyTorch, but we can manually compute
-    the equivalent result.
-    """
-    # Extract the real and imaginary parts of a and b
-    real_a = a[..., 0]
-    imag_a = a[..., 1]
-    real_b = b[..., 0]
-    imag_b = b[..., 1]
-
-    # Compute the multiplication with respect to the second dimension manually
-    real_part = paddle.einsum("bcft,dct->bdft", real_a, real_b) + paddle.einsum(
-        "bcft,dct->bdft", imag_a, imag_b)
-    imag_part = paddle.einsum("bcft,dct->bdft", imag_a, real_b) - paddle.einsum(
-        "bcft,dct->bdft", real_a, imag_b)
-
-    # Stack the real and imaginary parts together
-    result = paddle.stack([real_part, imag_part], axis=-1)
-    return result
-
-
-def fft_conv1d(
-        _input: paddle.Tensor,
-        weight: paddle.Tensor,
-        bias: Optional[paddle.Tensor]=None,
-        stride: int=1,
-        padding: int=0,
-        block_ratio: float=5, ):
-    """
-    Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
-    Please check PaddlePaddle documentation for more information.
-
-    Args:
-        _input (Tensor): _input signal of shape `[B, C, T]`.
-        weight (Tensor): weight of the convolution `[D, C, K]` with `D` the number
-            of output channels.
-        bias (Tensor or None): if not None, bias term for the convolution.
-        stride (int): stride of convolution.
-        padding (int): padding to apply to the _input.
-        block_ratio (float): can be tuned for speed. The _input is splitted in chunks
-            with a size of `int(block_ratio * kernel_size)`.
-
-    Shape:
-
-        - Inputs: `_input` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
-        - Output: `(*, T)`
-
-    ..note::
-        This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
-        Typically, the kernel size should be of the order of 256 to see any real gain,
-        for a stride of 1.
-
-    ..Warning::
-        Dilation and groups are not supported at the moment. This function might use
-        more memory than the default Conv1d implementation.
-    """
-    _input = F.pad(_input, (padding, padding), data_format="NCL")
-    batch, channels, length = _input.shape
-    out_channels, _, kernel_size = weight.shape
-
-    if length < kernel_size:
-        raise RuntimeError(
-            f"Input should be at least as large as the kernel size {kernel_size}, "
-            f"but it is only {length} samples long.")
-    if block_ratio < 1:
-        raise RuntimeError("Block ratio must be greater than 1.")
-
-    block_size: int = min(int(kernel_size * block_ratio), length)
-    fold_stride = block_size - kernel_size + 1
-    weight = pad_to(weight, block_size)
-    weight_z = _new_rfft(weight)
-
-    # We pad the _input and get the different frames, on which
-    frames = unfold(_input, block_size, fold_stride)
-
-    frames_z = _new_rfft(frames)
-    out_z = _compl_mul_conjugate(frames_z, weight_z)
-    out = _new_irfft(out_z, block_size)
-    # The last bit is invalid, because FFT will do a circular convolution.
-    out = out[..., :-kernel_size + 1]
-    out = out.reshape([batch, out_channels, -1])
-    out = out[..., ::stride]
-    target_length = (length - kernel_size) // stride + 1
-    out = out[..., :target_length]
-    if bias is not None:
-        out += bias[:, None]
-    return out
-
-
-class FFTConv1d(paddle.nn.Layer):
-    """
-    Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
-    Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
-
-    Args:
-        in_channels (int): number of _input channels.
-        out_channels (int): number of output channels.
-        kernel_size (int): kernel size of convolution.
-        stride (int): stride of convolution.
-        padding (int): padding to apply to the _input.
-        bias (bool): if True, use a bias term.
-
-    ..note::
-        This module is faster than `paddle.nn.Conv1D` only in specific cases.
-        Typically, `kernel_size` should be of the order of 256 to see any real gain,
-        for a stride of 1.
-
-    ..warning::
-        Dilation and groups are not supported at the moment. This module might use
-        more memory than the default Conv1D implementation.
-
-    >>> fftconv = FFTConv1d(12, 24, 128, 4)
-    >>> x = paddle.randn([4, 12, 1024])
-    >>> print(list(fftconv(x).shape))
-    [4, 24, 225]
-    """
-
-    def __init__(
-            self,
-            in_channels: int,
-            out_channels: int,
-            kernel_size: int,
-            stride: int=1,
-            padding: int=0,
-            bias: bool=True, ):
-        super(FFTConv1d, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-
-        # Create a Conv1D layer to initialize weights and bias
-        conv = paddle.nn.Conv1D(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=padding,
-            bias_attr=bias)
-        self.weight = conv.weight
-        if bias:
-            self.bias = conv.bias
-        else:
-            self.bias = None
-
-    def forward(self, _input: paddle.Tensor):
-        return fft_conv1d(_input, self.weight, self.bias, self.stride,
-                          self.padding)
+# def unfold(_input, kernel_size: int, stride: int):
+#     """1D only unfolding similar to the one from PyTorch.
+#     However PyTorch unfold is extremely slow.
+
+#     Given an _input tensor of size `[*, T]` this will return
+#     a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
+#     of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
+#     This will automatically pad the _input to cover at least once all entries in `_input`.
+
+#     Args:
+#         _input (Tensor): tensor for which to return the frames.
+#         kernel_size (int): size of each frame.
+#         stride (int): stride between each frame.
+
+#     Shape:
+
+#         - Inputs: `_input` is `[*, T]`
+#         - Output: `[*, F, kernel_size]` with `F = 1 + ceil((T - kernel_size) / stride)`
+
+#     ..Warning:: unlike PyTorch unfold, this will pad the _input
+#         so that any position in `_input` is covered by at least one frame.
+#     """
+#     shape = list(_input.shape)
+#     length = shape.pop(-1)
+#     n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
+#     tgt_length = (n_frames - 1) * stride + kernel_size
+#     padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")
+#     strides: typing.List[int] = []
+#     for dim in range(padded.dim()):
+#         strides.append(padded.strides[dim])
+#     assert strides.pop(-1) == 1, "data should be contiguous"
+#     strides = strides + [stride, 1]
+#     return padded.as_strided(shape + [n_frames, kernel_size], strides)
+
+# def _new_rfft(x: paddle.Tensor):
+#     z = paddle.fft.rfft(x, axis=-1)
+
+#     z_real = paddle.real(z)
+#     z_imag = paddle.imag(z)
+
+#     z_view_as_real = paddle.stack([z_real, z_imag], axis=-1)
+#     return z_view_as_real
+
+# def _new_irfft(x: paddle.Tensor, length: int):
+#     x_real = x[..., 0]
+#     x_imag = x[..., 1]
+#     x_view_as_complex = paddle.complex(x_real, x_imag)
+#     return paddle.fft.irfft(x_view_as_complex, n=length, axis=-1)

+# def _compl_mul_conjugate(a: paddle.Tensor, b: paddle.Tensor):
+#     """
+#     Given a and b two tensors of dimension 4
+#     with the last dimension being the real and imaginary part,
+#     returns a multiplied by the conjugate of b, the multiplication
+#     being with respect to the second dimension.
+
+#     PaddlePaddle does not have direct support for complex number operations
+#     using einsum in the same manner as PyTorch, but we can manually compute
+#     the equivalent result.
+#     """
+#     # Extract the real and imaginary parts of a and b
+#     real_a = a[..., 0]
+#     imag_a = a[..., 1]
+#     real_b = b[..., 0]
+#     imag_b = b[..., 1]

+#     # Compute the multiplication with respect to the second dimension manually
+#     real_part = paddle.einsum("bcft,dct->bdft", real_a, real_b) + paddle.einsum(
+#         "bcft,dct->bdft", imag_a, imag_b)
+#     imag_part = paddle.einsum("bcft,dct->bdft", imag_a, real_b) - paddle.einsum(
+#         "bcft,dct->bdft", real_a, imag_b)

+#     # Stack the real and imaginary parts together
+#     result = paddle.stack([real_part, imag_part], axis=-1)
+#     return result

+# def fft_conv1d(
+#         _input: paddle.Tensor,
+#         weight: paddle.Tensor,
+#         bias: Optional[paddle.Tensor]=None,
+#         stride: int=1,
+#         padding: int=0,
+#         block_ratio: float=5, ):
+#     """
+#     Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
+#     Please check PaddlePaddle documentation for more information.

+#     Args:
+#         _input (Tensor): _input signal of shape `[B, C, T]`.
+#         weight (Tensor): weight of the convolution `[D, C, K]` with `D` the number
+#             of output channels.
+#         bias (Tensor or None): if not None, bias term for the convolution.
+#         stride (int): stride of convolution.
+#         padding (int): padding to apply to the _input.
+#         block_ratio (float): can be tuned for speed. The _input is split into chunks
+#             with a size of `int(block_ratio * kernel_size)`.

+#     Shape:

+#         - Inputs: `_input` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
+#         - Output: `(*, T)`

+#     ..note::
+#         This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
+#         Typically, the kernel size should be of the order of 256 to see any real gain,
+#         for a stride of 1.

+#     ..Warning::
+#         Dilation and groups are not supported at the moment. This function might use
+#         more memory than the default Conv1d implementation.
+#     """
+#     _input = F.pad(_input, (padding, padding), data_format="NCL")
+#     batch, channels, length = _input.shape
+#     out_channels, _, kernel_size = weight.shape

+#     if length < kernel_size:
+#         raise RuntimeError(
+#             f"Input should be at least as large as the kernel size {kernel_size}, "
+#             f"but it is only {length} samples long.")
+#     if block_ratio < 1:
+#         raise RuntimeError("Block ratio must be greater than 1.")

+#     block_size: int = min(int(kernel_size * block_ratio), length)
+#     fold_stride = block_size - kernel_size + 1
+#     weight = pad_to(weight, block_size)
+#     weight_z = _new_rfft(weight)

+#     # We pad the _input and get the different frames, on which
+#     frames = unfold(_input, block_size, fold_stride)

+#     frames_z = _new_rfft(frames)
+#     out_z = _compl_mul_conjugate(frames_z, weight_z)
+#     out = _new_irfft(out_z, block_size)
+#     # The last bit is invalid, because FFT will do a circular convolution.
+#     out = out[..., :-kernel_size + 1]
+#     out = out.reshape([batch, out_channels, -1])
+#     out = out[..., ::stride]
+#     target_length = (length - kernel_size) // stride + 1
+#     out = out[..., :target_length]
+#     if bias is not None:
+#         out += bias[:, None]
+#     return out

+# class FFTConv1d(paddle.nn.Layer):
+#     """
+#     Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
+#     Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.

+#     Args:
+#         in_channels (int): number of _input channels.
+#         out_channels (int): number of output channels.
+# kernel_size (int): kernel size of convolution. +# stride (int): stride of convolution. +# padding (int): padding to apply to the _input. +# bias (bool): if True, use a bias term. + +# ..note:: +# This module is faster than `paddle.nn.Conv1D` only in specific cases. +# Typically, `kernel_size` should be of the order of 256 to see any real gain, +# for a stride of 1. + +# ..warning:: +# Dilation and groups are not supported at the moment. This module might use +# more memory than the default Conv1D implementation. + +# >>> fftconv = FFTConv1d(12, 24, 128, 4) +# >>> x = paddle.randn([4, 12, 1024]) +# >>> print(list(fftconv(x).shape)) +# [4, 24, 225] +# """ + +# def __init__( +# self, +# in_channels: int, +# out_channels: int, +# kernel_size: int, +# stride: int=1, +# padding: int=0, +# bias: bool=True, ): +# super(FFTConv1d, self).__init__() +# self.in_channels = in_channels +# self.out_channels = out_channels +# self.kernel_size = kernel_size +# self.stride = stride +# self.padding = padding + +# # Create a Conv1D layer to initialize weights and bias +# conv = paddle.nn.Conv1D( +# in_channels, +# out_channels, +# kernel_size, +# stride=stride, +# padding=padding, +# bias_attr=bias) +# self.weight = conv.weight +# if bias: +# self.bias = conv.bias +# else: +# self.bias = None + +# def forward(self, _input: paddle.Tensor): +# return fft_conv1d(_input, self.weight, self.bias, self.stride, +# self.padding) class LowPassFilters(nn.Layer): diff --git a/audio/audiotools/core/audio_signal.py b/audio/audiotools/core/audio_signal.py index ed57ada89..f50172b0d 100644 --- a/audio/audiotools/core/audio_signal.py +++ b/audio/audiotools/core/audio_signal.py @@ -784,10 +784,12 @@ class AudioSignal( if self.stft_data is not None: self.stft_data = self.stft_data.to(device) if self.audio_data is not None: - if 'cpu' == device: + if device is None or "" == device: + return self + elif 'cpu' == device: device = paddle.to_tensor( self.audio_data, place=paddle.CPUPlace()) - if 'gpu' == device or 'cuda' == device: + elif 'gpu' == device or 'cuda' == device: device = paddle.to_tensor( self.audio_data, place=paddle.CUDAPlace()) device = device.replace("cuda", diff --git a/audio/audiotools/data/transforms.py b/audio/audiotools/data/transforms.py index dcd714d8b..71d78fcf2 100644 --- a/audio/audiotools/data/transforms.py +++ b/audio/audiotools/data/transforms.py @@ -602,7 +602,7 @@ class Equalizer(BaseTransform): # class Quantization(BaseTransform): -# """❌Applies quantization to the input waveform. Corresponds +# """Applies quantization to the input waveform. Corresponds # to :py:func:`audiotools.core.effects.EffectMixin.quantization`. # Parameters
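
Note: the _julius.py hunks above replace the local FFT convolution with the PaddleSpeech implementations while keeping the exported names (`fft_conv1d`, `FFTConv1D`). A minimal smoke test of the intended contract, assuming the PaddleSpeech checkout added via `sys.path.append` exposes the same signatures as the removed code (the expected output shape comes from the removed `FFTConv1d` doctest; the `audiotools.core._julius` import path assumes `audio/` is on `sys.path`):

    import paddle
    import paddle.nn.functional as F

    # Re-exports from the patched module.
    from audiotools.core._julius import fft_conv1d, FFTConv1D

    x = paddle.randn([4, 12, 1024])

    # Module form: constructor mirrors paddle.nn.Conv1D
    # (in_channels, out_channels, kernel_size, stride).
    fftconv = FFTConv1D(12, 24, 128, 4)
    print(list(fftconv(x).shape))  # [4, 24, 225], as in the removed doctest

    # Functional form: should agree numerically with the native conv1d.
    w = paddle.randn([24, 12, 128])
    b = paddle.zeros([24])
    y_fft = fft_conv1d(x, w, b, stride=4)
    y_ref = F.conv1d(x, w, b, stride=4)
    print(float((y_fft - y_ref).abs().max()))  # expected ~0, up to FFT rounding

Because the patch hard-codes `sys.path.append("/home/aistudio/PaddleSpeech")`, this check only runs as-is in that AIStudio layout; elsewhere the path would need to point at a local PaddleSpeech checkout.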