diff --git a/audio/audiotools/__init__.py b/audio/audiotools/__init__.py index 12ffa327f..2f71c32d9 100644 --- a/audio/audiotools/__init__.py +++ b/audio/audiotools/__init__.py @@ -1,12 +1,12 @@ -__version__ = "0.0.1" -from .core import AudioSignal -from .core import STFTParams -from .core import Meter -from .core import util -from .core import highpass_filter, highpass_filters -from . import metrics from . import data +from . import metrics from . import ml from . import post +from .core import AudioSignal +from .core import highpass_filter +from .core import highpass_filters +from .core import Meter +from .core import STFTParams +from .core import util from .data import datasets from .data import transforms diff --git a/audio/audiotools/core/_julius.py b/audio/audiotools/core/_julius.py index af70c3878..fdc1e277a 100644 --- a/audio/audiotools/core/_julius.py +++ b/audio/audiotools/core/_julius.py @@ -15,14 +15,14 @@ from typing import Sequence import paddle import paddle.nn as nn import paddle.nn.functional as F -sys.path.append("/home/aistudio/PaddleSpeech") + from paddlespeech.t2s.modules import fft_conv1d from paddlespeech.t2s.modules import FFTConv1D +from paddlespeech.utils import satisfy_paddle_version __all__ = [ - 'fft_conv1d', 'FFTConv1D', 'highpass_filter', 'highpass_filters', - 'lowpass_filter', 'LowPassFilter', 'LowPassFilters', 'pure_tone', - 'resample_frac', 'split_bands', 'SplitBands' + 'highpass_filter', 'highpass_filters', 'lowpass_filter', 'LowPassFilter', + 'LowPassFilters', 'pure_tone', 'resample_frac', 'split_bands', 'SplitBands' ] @@ -61,6 +61,9 @@ def sinc(x: paddle.Tensor): __Warning__: the input is not multiplied by `pi`! """ + if satisfy_paddle_version("2.6"): + return paddle.sinc(x) + return paddle.where( x == 0, paddle.to_tensor(1.0, dtype=x.dtype, place=x.place), @@ -103,7 +106,7 @@ class ResampleFrac(paddle.nn.Layer): >>> print(len(resample(x))) 1250 """ - super(ResampleFrac, self).__init__() + super().__init__() if not isinstance(old_sr, int) or not isinstance(new_sr, int): raise ValueError("old_sr and new_sr should be integers") gcd = math.gcd(old_sr, new_sr) @@ -257,7 +260,7 @@ class LowPassFilters(nn.Layer): zeros: float=8, fft: Optional[bool]=None, dtype="float32"): - super(LowPassFilters, self).__init__() + super().__init__() self.cutoffs = list(cutoffs) if min(self.cutoffs) < 0: raise ValueError("Minimum cutoff must be larger than zero.") @@ -325,7 +328,7 @@ class LowPassFilter(nn.Layer): pad: bool=True, zeros: float=8, fft: Optional[bool]=None): - super(LowPassFilter, self).__init__() + super().__init__() self._lowpasses = LowPassFilters([cutoff], stride, pad, zeros, fft) @property @@ -583,7 +586,7 @@ class SplitBands(paddle.nn.Layer): pad: bool=True, zeros: float=8, fft: Optional[bool]=None, ): - super(SplitBands, self).__init__() + super().__init__() if (cutoffs is None) + (n_bands is None) != 1: raise ValueError( "You must provide either n_bands, or cutoffs, but not both.") diff --git a/audio/audiotools/core/audio_signal.py b/audio/audiotools/core/audio_signal.py index 9d1faca20..50b46a4de 100644 --- a/audio/audiotools/core/audio_signal.py +++ b/audio/audiotools/core/audio_signal.py @@ -587,7 +587,7 @@ class AudioSignal( self.original_signal_length = self.signal_length self.sample_rate = sample_rate - # return self.to(device) + return self def write(self, audio_path: typing.Union[str, Path]): @@ -1198,7 +1198,6 @@ class AudioSignal( padding_type = self.stft_params.padding_type if padding_type is None else padding_type window = self.get_window(window_type, window_length) - # window = window.to(self.audio_data.device) audio_data = self.audio_data right_pad, pad = self.compute_stft_padding(window_length, hop_length, @@ -1362,6 +1361,26 @@ class AudioSignal( paddle.Tensor [shape=(batch, channels, mels, time)] Mel spectrogram. """ + # from paddle.audio.compliance.librosa import melspectrogram + # # from ..compliance.librosa import melspectrogram + # return melspectrogram( + # x=self.audio_data, + # sr=self.sample_rate, + # window_size: int=512, + # hop_length: int=320, + # n_mels: int=64, + # fmin: float=50.0, + # fmax: Optional[float]=None, + # window: str='hann', + # center: bool=True, + # pad_mode: str='reflect', + # power: float=2.0, + # to_db: bool=True, + # ref: float=1.0, + # amin: float=1e-10, + # top_db: Optional[float]=None + # ) + stft = self.stft(**kwargs) magnitude = paddle.abs(stft) @@ -1429,6 +1448,9 @@ class AudioSignal( MFCCs. """ + # from paddle.audio.compliance.librosa import mfcc + # return mfcc(self.audio_data, self.sample_rate, n_mfcc=n_mfcc, n_mels=n_mels) + mel_spectrogram = self.mel_spectrogram(n_mels, **kwargs) mel_spectrogram = paddle.log(mel_spectrogram + log_offset) dct_mat = self.get_dct(n_mfcc, n_mels, "ortho", self.device)