fix paddle2.5 version Q

pull/3900/head
drryanhuang 9 months ago
parent 5e0f85a738
commit 4ebc75cf10

@@ -9,6 +9,7 @@ evaluated with a stride of 1.
 """
 import inspect
 import math
+import sys
 import typing
 from typing import Optional
 from typing import Sequence
@@ -16,9 +17,12 @@ from typing import Sequence
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
+sys.path.append("/home/aistudio/PaddleSpeech")
+from paddlespeech.t2s.modules import fft_conv1d
+from paddlespeech.t2s.modules import FFTConv1D
 
 __all__ = [
-    'fft_conv1d', 'FFTConv1d', 'highpass_filter', 'highpass_filters',
+    'fft_conv1d', 'FFTConv1D', 'highpass_filter', 'highpass_filters',
     'lowpass_filter', 'LowPassFilter', 'LowPassFilters', 'pure_tone',
     'resample_frac', 'split_bands', 'SplitBands'
 ]
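Note: the three added imports replace the local FFT-convolution implementation that this commit comments out below; the hard-coded `sys.path.append("/home/aistudio/PaddleSpeech")` is an AI Studio environment assumption, not a portable install. A quick smoke test for the swapped-in class (a sketch; it assumes the imported `FFTConv1D` keeps the signature and doctest behaviour of the removed local `FFTConv1d`):

```python
import paddle
from paddlespeech.t2s.modules import FFTConv1D

# Same call as the old doctest: in_channels=12, out_channels=24,
# kernel_size=128, stride=4; output length = (1024 - 128) // 4 + 1 = 225.
fftconv = FFTConv1D(12, 24, 128, 4)
x = paddle.randn([4, 12, 1024])
print(list(fftconv(x).shape))  # expected: [4, 24, 225]
```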
@@ -243,216 +247,209 @@ def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
     return paddle.cos(2 * math.pi * freq * time)
 
 
-def unfold(_input, kernel_size: int, stride: int):
-    """1D only unfolding similar to the one from PyTorch.
-    However PyTorch unfold is extremely slow.
-    Given an _input tensor of size `[*, T]` this will return
-    a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
-    of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
-    This will automatically pad the _input to cover at least once all entries in `_input`.
-    Args:
-        _input (Tensor): tensor for which to return the frames.
-        kernel_size (int): size of each frame.
-        stride (int): stride between each frame.
-    Shape:
-        - Inputs: `_input` is `[*, T]`
-        - Output: `[*, F, kernel_size]` with `F = 1 + ceil((T - kernel_size) / stride)`
-
-    ..Warning:: unlike PyTorch unfold, this will pad the _input
-        so that any position in `_input` is covered by at least one frame.
-    """
-    shape = list(_input.shape)
-    length = shape.pop(-1)
-    n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
-    tgt_length = (n_frames - 1) * stride + kernel_size
-    padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")
-    strides: typing.List[int] = []
-    for dim in range(padded.dim()):
-        strides.append(padded.strides[dim])
-    assert strides.pop(-1) == 1, "data should be contiguous"
-    strides = strides + [stride, 1]
-    return padded.as_strided(shape + [n_frames, kernel_size], strides)
-
-
-def _new_rfft(x: paddle.Tensor):
-    z = paddle.fft.rfft(x, axis=-1)
-
-    z_real = paddle.real(z)
-    z_imag = paddle.imag(z)
-
-    z_view_as_real = paddle.stack([z_real, z_imag], axis=-1)
-    return z_view_as_real
-
-
-def _new_irfft(x: paddle.Tensor, length: int):
-    x_real = x[..., 0]
-    x_imag = x[..., 1]
-    x_view_as_complex = paddle.complex(x_real, x_imag)
-    return paddle.fft.irfft(x_view_as_complex, n=length, axis=-1)
-
-
-def _compl_mul_conjugate(a: paddle.Tensor, b: paddle.Tensor):
-    """
-    Given a and b two tensors of dimension 4
-    with the last dimension being the real and imaginary part,
-    returns a multiplied by the conjugate of b, the multiplication
-    being with respect to the second dimension.
-
-    PaddlePaddle does not have direct support for complex number operations
-    using einsum in the same manner as PyTorch, but we can manually compute
-    the equivalent result.
-    """
-    # Extract the real and imaginary parts of a and b
-    real_a = a[..., 0]
-    imag_a = a[..., 1]
-    real_b = b[..., 0]
-    imag_b = b[..., 1]
-
-    # Compute the multiplication with respect to the second dimension manually
-    real_part = paddle.einsum("bcft,dct->bdft", real_a, real_b) + paddle.einsum(
-        "bcft,dct->bdft", imag_a, imag_b)
-    imag_part = paddle.einsum("bcft,dct->bdft", imag_a, real_b) - paddle.einsum(
-        "bcft,dct->bdft", real_a, imag_b)
-
-    # Stack the real and imaginary parts together
-    result = paddle.stack([real_part, imag_part], axis=-1)
-    return result
-
-
-def fft_conv1d(
-        _input: paddle.Tensor,
-        weight: paddle.Tensor,
-        bias: Optional[paddle.Tensor]=None,
-        stride: int=1,
-        padding: int=0,
-        block_ratio: float=5, ):
-    """
-    Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
-    Please check PaddlePaddle documentation for more information.
-
-    Args:
-        _input (Tensor): _input signal of shape `[B, C, T]`.
-        weight (Tensor): weight of the convolution `[D, C, K]` with `D` the number
-            of output channels.
-        bias (Tensor or None): if not None, bias term for the convolution.
-        stride (int): stride of convolution.
-        padding (int): padding to apply to the _input.
-        block_ratio (float): can be tuned for speed. The _input is splitted in chunks
-            with a size of `int(block_ratio * kernel_size)`.
-
-    Shape:
-
-        - Inputs: `_input` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
-        - Output: `(*, T)`
-
-
-    ..note::
-        This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
-        Typically, the kernel size should be of the order of 256 to see any real gain,
-        for a stride of 1.
-
-    ..Warning::
-        Dilation and groups are not supported at the moment. This function might use
-        more memory than the default Conv1d implementation.
-    """
-    _input = F.pad(_input, (padding, padding), data_format="NCL")
-    batch, channels, length = _input.shape
-    out_channels, _, kernel_size = weight.shape
-
-    if length < kernel_size:
-        raise RuntimeError(
-            f"Input should be at least as large as the kernel size {kernel_size}, "
-            f"but it is only {length} samples long.")
-    if block_ratio < 1:
-        raise RuntimeError("Block ratio must be greater than 1.")
-
-    block_size: int = min(int(kernel_size * block_ratio), length)
-    fold_stride = block_size - kernel_size + 1
-    weight = pad_to(weight, block_size)
-    weight_z = _new_rfft(weight)
-
-    # We pad the _input and get the different frames, on which
-    frames = unfold(_input, block_size, fold_stride)
-
-    frames_z = _new_rfft(frames)
-    out_z = _compl_mul_conjugate(frames_z, weight_z)
-    out = _new_irfft(out_z, block_size)
-    # The last bit is invalid, because FFT will do a circular convolution.
-    out = out[..., :-kernel_size + 1]
-    out = out.reshape([batch, out_channels, -1])
-    out = out[..., ::stride]
-    target_length = (length - kernel_size) // stride + 1
-    out = out[..., :target_length]
-    if bias is not None:
-        out += bias[:, None]
-    return out
-
-
-class FFTConv1d(paddle.nn.Layer):
-    """
-    Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
-    Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
-
-    Args:
-        in_channels (int): number of _input channels.
-        out_channels (int): number of output channels.
-        kernel_size (int): kernel size of convolution.
-        stride (int): stride of convolution.
-        padding (int): padding to apply to the _input.
-        bias (bool): if True, use a bias term.
-
-    ..note::
-        This module is faster than `paddle.nn.Conv1D` only in specific cases.
-        Typically, `kernel_size` should be of the order of 256 to see any real gain,
-        for a stride of 1.
-
-    ..warning::
-        Dilation and groups are not supported at the moment. This module might use
-        more memory than the default Conv1D implementation.
-
-    >>> fftconv = FFTConv1d(12, 24, 128, 4)
-    >>> x = paddle.randn([4, 12, 1024])
-    >>> print(list(fftconv(x).shape))
-    [4, 24, 225]
-    """
-
-    def __init__(
-            self,
-            in_channels: int,
-            out_channels: int,
-            kernel_size: int,
-            stride: int=1,
-            padding: int=0,
-            bias: bool=True, ):
-        super(FFTConv1d, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-
-        # Create a Conv1D layer to initialize weights and bias
-        conv = paddle.nn.Conv1D(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride=stride,
-            padding=padding,
-            bias_attr=bias)
-        self.weight = conv.weight
-        if bias:
-            self.bias = conv.bias
-        else:
-            self.bias = None
-
-    def forward(self, _input: paddle.Tensor):
-        return fft_conv1d(_input, self.weight, self.bias, self.stride,
-                          self.padding)
+# def unfold(_input, kernel_size: int, stride: int):
+#     """1D only unfolding similar to the one from PyTorch.
+#     However PyTorch unfold is extremely slow.
+#     Given an _input tensor of size `[*, T]` this will return
+#     a tensor `[*, F, K]` with `K` the kernel size, and `F` the number
+#     of frames. The i-th frame is a view onto `i * stride: i * stride + kernel_size`.
+#     This will automatically pad the _input to cover at least once all entries in `_input`.
+#     Args:
+#         _input (Tensor): tensor for which to return the frames.
+#         kernel_size (int): size of each frame.
+#         stride (int): stride between each frame.
+#     Shape:
+#         - Inputs: `_input` is `[*, T]`
+#         - Output: `[*, F, kernel_size]` with `F = 1 + ceil((T - kernel_size) / stride)`
+#     ..Warning:: unlike PyTorch unfold, this will pad the _input
+#         so that any position in `_input` is covered by at least one frame.
+#     """
+#     shape = list(_input.shape)
+#     length = shape.pop(-1)
+#     n_frames = math.ceil((max(length, kernel_size) - kernel_size) / stride) + 1
+#     tgt_length = (n_frames - 1) * stride + kernel_size
+#     padded = F.pad(_input, (0, tgt_length - length), data_format="NCL")
+#     strides: typing.List[int] = []
+#     for dim in range(padded.dim()):
+#         strides.append(padded.strides[dim])
+#     assert strides.pop(-1) == 1, "data should be contiguous"
+#     strides = strides + [stride, 1]
+#     return padded.as_strided(shape + [n_frames, kernel_size], strides)
+
+# def _new_rfft(x: paddle.Tensor):
+#     z = paddle.fft.rfft(x, axis=-1)
+
+#     z_real = paddle.real(z)
+#     z_imag = paddle.imag(z)
+
+#     z_view_as_real = paddle.stack([z_real, z_imag], axis=-1)
+#     return z_view_as_real
+
+# def _new_irfft(x: paddle.Tensor, length: int):
+#     x_real = x[..., 0]
+#     x_imag = x[..., 1]
+#     x_view_as_complex = paddle.complex(x_real, x_imag)
+#     return paddle.fft.irfft(x_view_as_complex, n=length, axis=-1)
+
+# def _compl_mul_conjugate(a: paddle.Tensor, b: paddle.Tensor):
+#     """
+#     Given a and b two tensors of dimension 4
+#     with the last dimension being the real and imaginary part,
+#     returns a multiplied by the conjugate of b, the multiplication
+#     being with respect to the second dimension.
+
+#     PaddlePaddle does not have direct support for complex number operations
+#     using einsum in the same manner as PyTorch, but we can manually compute
+#     the equivalent result.
+#     """
+#     # Extract the real and imaginary parts of a and b
+#     real_a = a[..., 0]
+#     imag_a = a[..., 1]
+#     real_b = b[..., 0]
+#     imag_b = b[..., 1]
+
+#     # Compute the multiplication with respect to the second dimension manually
+#     real_part = paddle.einsum("bcft,dct->bdft", real_a, real_b) + paddle.einsum(
+#         "bcft,dct->bdft", imag_a, imag_b)
+#     imag_part = paddle.einsum("bcft,dct->bdft", imag_a, real_b) - paddle.einsum(
+#         "bcft,dct->bdft", real_a, imag_b)
+
+#     # Stack the real and imaginary parts together
+#     result = paddle.stack([real_part, imag_part], axis=-1)
+#     return result
+
+# def fft_conv1d(
+#         _input: paddle.Tensor,
+#         weight: paddle.Tensor,
+#         bias: Optional[paddle.Tensor]=None,
+#         stride: int=1,
+#         padding: int=0,
+#         block_ratio: float=5, ):
+#     """
+#     Same as `paddle.nn.functional.conv1d` but using FFT for the convolution.
+#     Please check PaddlePaddle documentation for more information.
+
+#     Args:
+#         _input (Tensor): _input signal of shape `[B, C, T]`.
+#         weight (Tensor): weight of the convolution `[D, C, K]` with `D` the number
+#             of output channels.
+#         bias (Tensor or None): if not None, bias term for the convolution.
+#         stride (int): stride of convolution.
+#         padding (int): padding to apply to the _input.
+#         block_ratio (float): can be tuned for speed. The _input is splitted in chunks
+#             with a size of `int(block_ratio * kernel_size)`.
+
+#     Shape:
+
+#         - Inputs: `_input` is `[B, C, T]`, `weight` is `[D, C, K]` and bias is `[D]`.
+#         - Output: `(*, T)`
+
+#     ..note::
+#         This function is faster than `paddle.nn.functional.conv1d` only in specific cases.
+#         Typically, the kernel size should be of the order of 256 to see any real gain,
+#         for a stride of 1.
+
+#     ..Warning::
+#         Dilation and groups are not supported at the moment. This function might use
+#         more memory than the default Conv1d implementation.
+#     """
+#     _input = F.pad(_input, (padding, padding), data_format="NCL")
+#     batch, channels, length = _input.shape
+#     out_channels, _, kernel_size = weight.shape
+
+#     if length < kernel_size:
+#         raise RuntimeError(
+#             f"Input should be at least as large as the kernel size {kernel_size}, "
+#             f"but it is only {length} samples long.")
+#     if block_ratio < 1:
+#         raise RuntimeError("Block ratio must be greater than 1.")
+
+#     block_size: int = min(int(kernel_size * block_ratio), length)
+#     fold_stride = block_size - kernel_size + 1
+#     weight = pad_to(weight, block_size)
+#     weight_z = _new_rfft(weight)
+
+#     # We pad the _input and get the different frames, on which
+#     frames = unfold(_input, block_size, fold_stride)
+
+#     frames_z = _new_rfft(frames)
+#     out_z = _compl_mul_conjugate(frames_z, weight_z)
+#     out = _new_irfft(out_z, block_size)
+#     # The last bit is invalid, because FFT will do a circular convolution.
+#     out = out[..., :-kernel_size + 1]
+#     out = out.reshape([batch, out_channels, -1])
+#     out = out[..., ::stride]
+#     target_length = (length - kernel_size) // stride + 1
+#     out = out[..., :target_length]
+#     if bias is not None:
+#         out += bias[:, None]
+#     return out
+
+# class FFTConv1d(paddle.nn.Layer):
+#     """
+#     Same as `paddle.nn.Conv1D` but based on a custom FFT-based convolution.
+#     Please check PaddlePaddle documentation for more information on `paddle.nn.Conv1D`.
+
+#     Args:
+#         in_channels (int): number of _input channels.
+#         out_channels (int): number of output channels.
+#         kernel_size (int): kernel size of convolution.
+#         stride (int): stride of convolution.
+#         padding (int): padding to apply to the _input.
+#         bias (bool): if True, use a bias term.
+
+#     ..note::
+#         This module is faster than `paddle.nn.Conv1D` only in specific cases.
+#         Typically, `kernel_size` should be of the order of 256 to see any real gain,
+#         for a stride of 1.
+
+#     ..warning::
+#         Dilation and groups are not supported at the moment. This module might use
+#         more memory than the default Conv1D implementation.
+
+#     >>> fftconv = FFTConv1d(12, 24, 128, 4)
+#     >>> x = paddle.randn([4, 12, 1024])
+#     >>> print(list(fftconv(x).shape))
+#     [4, 24, 225]
+#     """
+
+#     def __init__(
+#             self,
+#             in_channels: int,
+#             out_channels: int,
+#             kernel_size: int,
+#             stride: int=1,
+#             padding: int=0,
+#             bias: bool=True, ):
+#         super(FFTConv1d, self).__init__()
+#         self.in_channels = in_channels
+#         self.out_channels = out_channels
+#         self.kernel_size = kernel_size
+#         self.stride = stride
+#         self.padding = padding
+
+#         # Create a Conv1D layer to initialize weights and bias
+#         conv = paddle.nn.Conv1D(
+#             in_channels,
+#             out_channels,
+#             kernel_size,
+#             stride=stride,
+#             padding=padding,
+#             bias_attr=bias)
+#         self.weight = conv.weight
+#         if bias:
+#             self.bias = conv.bias
+#         else:
+#             self.bias = None
+
+#     def forward(self, _input: paddle.Tensor):
+#         return fft_conv1d(_input, self.weight, self.bias, self.stride,
+#                           self.padding)
 
 
 class LowPassFilters(nn.Layer):
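Note: the frame count used by the removed `unfold` follows the docstring formula `F = 1 + ceil((T - K) / S)`, with the input right-padded so every sample is covered. A worked check with the block sizes `fft_conv1d` would pick for a 128-tap kernel and `block_ratio=5` (values illustrative):

```python
import math

T = 1024          # input length
K = 5 * 128       # block_size = int(block_ratio * kernel_size) = 640
S = K - 128 + 1   # fold_stride = block_size - kernel_size + 1 = 513

n_frames = 1 + math.ceil((T - K) / S)  # 1 + ceil(384 / 513) = 2 frames
tgt_length = (n_frames - 1) * S + K    # pad the input to 513 + 640 = 1153
print(n_frames, tgt_length)            # 2 1153
```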
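Note: the einsum pair in the commented-out `_compl_mul_conjugate` is just `a * conj(b)` contracted over the channel axis, split into real and imaginary parts via `(r_a + i*i_a)(r_b - i*i_b) = (r_a*r_b + i_a*i_b) + i*(i_a*r_b - r_a*i_b)`. A small numerical check against Paddle's native complex arithmetic (a sketch; the trailing `[..., 2]` real/imag layout mirrors `_new_rfft`, and shapes are illustrative):

```python
import paddle

a = paddle.randn([2, 3, 4, 6, 2])  # [B, C, F, T, 2]
b = paddle.randn([5, 3, 6, 2])     # [D, C, T, 2]

real = (paddle.einsum("bcft,dct->bdft", a[..., 0], b[..., 0]) +
        paddle.einsum("bcft,dct->bdft", a[..., 1], b[..., 1]))
imag = (paddle.einsum("bcft,dct->bdft", a[..., 1], b[..., 0]) -
        paddle.einsum("bcft,dct->bdft", a[..., 0], b[..., 1]))

# Reference: complex multiply by conj(b), summed over the shared channel axis.
ac = paddle.complex(a[..., 0], a[..., 1])               # [B, C, F, T]
bc = paddle.conj(paddle.complex(b[..., 0], b[..., 1]))  # [D, C, T]
ref = (ac.unsqueeze(1) * bc.unsqueeze(0).unsqueeze(3)).sum(axis=2)  # [B, D, F, T]

print(paddle.allclose(paddle.real(ref), real, atol=1e-5))  # expect True
print(paddle.allclose(paddle.imag(ref), imag, atol=1e-5))  # expect True
```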
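Note: since the local implementation is now kept only as a comment, the imported PaddleSpeech version can be sanity-checked against the direct convolution it is meant to match (a sketch, assuming the imported `fft_conv1d` keeps the removed function's `(_input, weight, bias, stride, padding, block_ratio)` signature):

```python
import paddle
import paddle.nn.functional as F
from paddlespeech.t2s.modules import fft_conv1d

x = paddle.randn([4, 12, 1024])  # [B, C, T]
w = paddle.randn([24, 12, 128])  # [D, C, K]
b = paddle.randn([24])

ref = F.conv1d(x, w, bias=b, stride=4)       # direct convolution
out = fft_conv1d(x, w, bias=b, stride=4)     # FFT overlap-add path
print(list(ref.shape), list(out.shape))      # both [4, 24, 225]
print(paddle.allclose(ref, out, atol=1e-4))  # expect True up to FFT round-off
```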

@@ -784,10 +784,12 @@ class AudioSignal(
         if self.stft_data is not None:
             self.stft_data = self.stft_data.to(device)
         if self.audio_data is not None:
-            if 'cpu' == device:
+            if device is None or "" == device:
+                return self
+            elif 'cpu' == device:
                 device = paddle.to_tensor(
                     self.audio_data, place=paddle.CPUPlace())
-            if 'gpu' == device or 'cuda' == device:
+            elif 'gpu' == device or 'cuda' == device:
                 device = paddle.to_tensor(
                     self.audio_data, place=paddle.CUDAPlace())
         device = device.replace("cuda",

@@ -602,7 +602,7 @@ class Equalizer(BaseTransform):
 # class Quantization(BaseTransform):
 #     """Applies quantization to the input waveform. Corresponds
 #     to :py:func:`audiotools.core.effects.EffectMixin.quantization`.
 #     Parameters
